<!DOCTYPE HTML>
<html lang="zh-CN">


<head>
    <!-- The charset declaration comes first so the encoding is known before any other head content is parsed. -->
    <meta charset="utf-8">
    <meta name="baidu-site-verification" content="code-KNXLvfbWBj" />
    <meta name="keywords" content="数据采集笔记, Aunean&#39;s Blog">
    <meta name="description" content="">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <!-- user-scalable=no was removed: it prevents pinch-zoom, which low-vision readers rely on. -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="renderer" content="webkit|ie-stand|ie-comp">
    <meta name="mobile-web-app-capable" content="yes">
    <meta name="format-detection" content="telephone=no">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
    <!-- Global site tag (gtag.js) - Google Analytics -->


    <title>数据采集笔记 | Aunean&#39;s Blog</title>
    <link rel="icon" type="image/png" href="/favicon.png">

    <link rel="stylesheet" type="text/css" href="/libs/awesome/css/all.css">
    <link rel="stylesheet" type="text/css" href="/libs/materialize/materialize.min.css">
    <link rel="stylesheet" type="text/css" href="/libs/aos/aos.css">
    <link rel="stylesheet" type="text/css" href="/libs/animate/animate.min.css">
    <link rel="stylesheet" type="text/css" href="/libs/lightGallery/css/lightgallery.min.css">
    <link rel="stylesheet" type="text/css" href="/css/matery.css">
    <link rel="stylesheet" type="text/css" href="/css/my.css">

    <script src="/libs/jquery/jquery.min.js"></script>

<meta name="generator" content="Hexo 5.4.0">
<style>.github-emoji { position: relative; display: inline-block; width: 1.2em; min-height: 1.2em; overflow: hidden; vertical-align: top; color: transparent; }  .github-emoji > span { position: relative; z-index: 10; }  .github-emoji img, .github-emoji .fancybox { margin: 0 !important; padding: 0 !important; border: none !important; outline: none !important; text-decoration: none !important; user-select: none !important; cursor: auto !important; }  .github-emoji img { height: 1.2em !important; width: 1.2em !important; position: absolute !important; left: 50% !important; top: 50% !important; transform: translate(-50%, -50%) !important; user-select: none !important; cursor: auto !important; } .github-emoji-fallback { color: inherit; } .github-emoji-fallback img { opacity: 0 !important; }</style>
<link rel="alternate" href="/atom.xml" title="Aunean's Blog" type="application/atom+xml">
<link rel="stylesheet" href="/css/prism-tomorrow.css" type="text/css"></head>





<body>
    <header class="navbar-fixed">
    <nav id="headNav" class="bg-color nav-transparent">
        <div id="navContainer" class="nav-wrapper container">
            <div class="brand-logo">
                <a href="/" class="waves-effect waves-light">
                    
                    <img src="/medias/logo.png" class="logo-img" alt="LOGO">
                    
                    <span class="logo-span">Aunean&#39;s Blog</span>
                </a>
            </div>
            

<!-- Mobile menu toggle: the icon is decorative, so the link carries the accessible name itself. -->
<a href="#" data-target="mobile-nav" class="sidenav-trigger button-collapse" aria-label="打开导航菜单"><i class="fas fa-bars" aria-hidden="true"></i></a>
<ul class="right nav-menu">
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/" class="waves-effect waves-light">
      
      <i class="fas fa-home" style="zoom: 0.6;"></i>
      
      <span>首页</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/tags" class="waves-effect waves-light">
      
      <i class="fas fa-tags" style="zoom: 0.6;"></i>
      
      <span>标签</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/categories" class="waves-effect waves-light">
      
      <i class="fas fa-bookmark" style="zoom: 0.6;"></i>
      
      <span>分类</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/archives" class="waves-effect waves-light">
      
      <i class="fas fa-archive" style="zoom: 0.6;"></i>
      
      <span>归档</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/about" class="waves-effect waves-light">
      
      <i class="fas fa-user-circle" style="zoom: 0.6;"></i>
      
      <span>关于</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/contact" class="waves-effect waves-light">
      
      <i class="fas fa-comments" style="zoom: 0.6;"></i>
      
      <span>留言板</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/friends" class="waves-effect waves-light">
      
      <i class="fas fa-address-book" style="zoom: 0.6;"></i>
      
      <span>友情链接</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/navigate" class="waves-effect waves-light">
      
      <i class="fas fa-location-arrow" style="zoom: 0.6;"></i>
      
      <span>导航</span>
    </a>
    
  </li>
  
  <li>
    <!-- Icon-only link: aria-label provides the name; a title on the icon alone is not reliably announced. -->
    <a href="#searchModal" class="modal-trigger waves-effect waves-light" aria-label="搜索">
      <i id="searchIcon" class="fas fa-search" title="搜索" style="zoom: 0.85;" aria-hidden="true"></i>
    </a>
  </li>
</ul>


<div id="mobile-nav" class="side-nav sidenav">

    <div class="mobile-head bg-color">
        
        <!-- Decorative logo: the site name follows as text, so the alt text is intentionally empty. -->
        <img src="/medias/logo.png" class="logo-img circle responsive-img" alt="">
        
        <div class="logo-name">Aunean&#39;s Blog</div>
        <div class="logo-desc">
            
            Never really desperate, only the lost of the soul.
            
        </div>
    </div>

    

    <ul class="menu-list mobile-menu-list">
        
        <li class="m-nav-item">
	  
		<a href="/" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-home"></i>
			
			首页
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/tags" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-tags"></i>
			
			标签
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/categories" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-bookmark"></i>
			
			分类
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/archives" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-archive"></i>
			
			归档
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/about" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-user-circle"></i>
			
			关于
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/contact" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-comments"></i>
			
			留言板
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/friends" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-address-book"></i>
			
			友情链接
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/navigate" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-location-arrow"></i>
			
			导航
		</a>
          
        </li>
        
        
        <li><div class="divider"></div></li>
        <li>
            <!-- rel="noopener noreferrer" stops the new tab from reaching back via window.opener. -->
            <a href="https://github.com/Aunean-ls" class="waves-effect waves-light" target="_blank" rel="noopener noreferrer">
                <i class="fab fa-github-square fa-fw" aria-hidden="true"></i>Fork Me
            </a>
        </li>
        
    </ul>
</div>


        </div>

        
            <style>
    /* Styles for the GitHub "Fork Me" corner link rendered inside the navbar. */

    /* While the navbar carries .nav-transparent, the corner is hidden entirely. */
    .nav-transparent .github-corner {
        display: none !important;
    }

    /* Anchor the corner to the top-right and enlarge it slightly. */
    .github-corner {
        border: 0;
        position: absolute;
        right: 0;
        top: 0;
        transform: scale(1.1);
        z-index: 10;
    }

    /* 64px badge: `fill` paints the triangle; `color` is picked up by paths using fill="currentColor". */
    .github-corner svg {
        width: 64px;
        height: 64px;
        fill: #fff;
        color: #0f9d58;
    }

    /* The arm is still by default ... */
    .github-corner .octo-arm {
        animation: none;
    }

    /* ... and waves once while the corner is hovered. */
    .github-corner:hover .octo-arm {
        animation: a 0.56s ease-in-out;
    }

    /* Wave: rest -> back -> forward -> back -> forward -> rest. */
    @keyframes a {
        0%, 100% { transform: rotate(0); }
        20%, 60% { transform: rotate(-25deg); }
        40%, 80% { transform: rotate(10deg); }
    }
</style>

<!-- GitHub corner link. The SVG is aria-hidden, so aria-label supplies the accessible name;
     rel="noopener noreferrer" keeps the new tab from accessing window.opener. -->
<a href="https://github.com/Aunean-ls" class="github-corner tooltipped hide-on-med-and-down" target="_blank"
   rel="noopener noreferrer" aria-label="Fork Me"
   data-tooltip="Fork Me" data-position="left" data-delay="50">
    <svg viewBox="0 0 250 250" aria-hidden="true">
        <path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path>
        <path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2"
              fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path>
        <path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z"
              fill="currentColor" class="octo-body"></path>
    </svg>
</a>
        
    </nav>

</header>

    
<script src="/libs/cryptojs/crypto-js.min.js"></script>
<script>
    // Client-side gate for password-protected posts.
    // `expectedHash` is compared against the SHA-256 hex digest of whatever the
    // visitor types; when it is empty no prompt is shown at all.
    (function () {
        const expectedHash = '';
        if (!expectedHash) {
            return;
        }

        const typed = prompt('请输入访问本文章的密码');
        const typedHash = CryptoJS.SHA256(typed).toString(CryptoJS.enc.Hex);
        if (typedHash !== expectedHash) {
            alert('密码错误，将返回主页！');
            // On a mismatch, send the visitor back to the home page.
            location.href = '/';
        }
    })();
</script>




<div class="bg-cover pd-header post-cover" style="background-image: url('https://cdn.jsdelivr.net/gh/Aunean-ls/pic/img/u001.webp')">
    <div class="container" style="right: 0px;left: 0px;">
        <div class="row">
            <div class="col s12 m12 l12">
                <div class="brand">
                    <h1 class="description center-align post-title">数据采集笔记</h1>
                </div>
            </div>
        </div>
    </div>
</div>




<main class="post-container content">

    
    <link rel="stylesheet" href="/libs/tocbot/tocbot.css">
<style>
    #articleContent h1::before,
    #articleContent h2::before,
    #articleContent h3::before,
    #articleContent h4::before,
    #articleContent h5::before,
    #articleContent h6::before {
        display: block;
        content: " ";
        height: 100px;
        margin-top: -100px;
        visibility: hidden;
    }

    /* Suppress the focus ring only when the browser would not show one for
       keyboard navigation, so keyboard users keep a visible focus indicator. */
    #articleContent :focus:not(:focus-visible) {
        outline: none;
    }

    .toc-fixed {
        position: fixed;
        top: 64px;
    }

    .toc-widget {
        width: 345px;
        padding-left: 20px;
    }

    .toc-widget .toc-title {
        padding: 35px 0 15px 17px;
        font-size: 1.5rem;
        font-weight: bold;
        line-height: 1.5rem;
    }

    .toc-widget ol {
        padding: 0;
        list-style: none;
    }

    #toc-content {
        padding-bottom: 30px;
        overflow: auto;
    }

    #toc-content ol {
        padding-left: 10px;
    }

    #toc-content ol li {
        padding-left: 10px;
    }

    #toc-content .toc-link:hover {
        color: #42b983;
        font-weight: 700;
        text-decoration: underline;
    }

    #toc-content .toc-link::before {
        background-color: transparent;
        max-height: 25px;

        position: absolute;
        right: 23.5vw;
        display: block;
    }

    #toc-content .is-active-link {
        color: #42b983;
    }

    #floating-toc-btn {
        position: fixed;
        right: 15px;
        bottom: 76px;
        padding-top: 15px;
        margin-bottom: 0;
        z-index: 998;
    }

    #floating-toc-btn .btn-floating {
        width: 48px;
        height: 48px;
    }

    #floating-toc-btn .btn-floating i {
        line-height: 48px;
        font-size: 1.4rem;
    }
</style>
<div class="row">
    <div id="main-content" class="col s12 m12 l9">
        <!-- 文章内容详情 -->
<div id="artDetail">
    <div class="card">
        <div class="card-content article-info">
            <div class="row tag-cate">
                <div class="col s7">
                    
                    <div class="article-tag">
                        
                            <a href="/tags/python%E7%88%AC%E8%99%AB/">
                                <span class="chip bg-color">python爬虫</span>
                            </a>
                        
                    </div>
                    
                </div>
                <div class="col s5 right-align">
                    
                    <div class="post-cate">
                        <i class="fas fa-bookmark fa-fw icon-category"></i>
                        
                            <a href="/categories/%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0/" class="post-category">
                                学习笔记
                            </a>
                        
                    </div>
                    
                </div>
            </div>

            <div class="post-info">
                
                <div class="post-date info-break-policy">
                    <i class="far fa-calendar-minus fa-fw"></i>发布日期:&nbsp;&nbsp;
                    2021-04-11
                </div>
                

                
                <div class="post-date info-break-policy">
                    <i class="far fa-calendar-check fa-fw"></i>更新日期:&nbsp;&nbsp;
                    2021-09-23
                </div>
                

                
                <div class="info-break-policy">
                    <i class="far fa-file-word fa-fw"></i>文章字数:&nbsp;&nbsp;
                    13.7k
                </div>
                

                

                
            </div>
        </div>
        <hr class="clearfix">

        

        

        <div class="card-content article-card-content">
            <div id="articleContent">
                <h1 id="第一章、爬虫基础"><a href="#第一章、爬虫基础" class="headerlink" title="第一章、爬虫基础"></a>第一章、爬虫基础</h1><h2 id="1-1-HTTP-基本原理"><a href="#1-1-HTTP-基本原理" class="headerlink" title="1.1 HTTP 基本原理"></a>1.1 HTTP 基本原理</h2><h3 id="1-1-1-URI-和-URL"><a href="#1-1-1-URI-和-URL" class="headerlink" title="1.1.1 URI 和 URL"></a>1.1.1 URI 和 URL</h3><ul>
<li><p>URI 的全称为 Uniform Resource Identifier ，统一资源标志符</p>
</li>
<li><p>URL 的全称为 Uniform Resource Locator，统一资源定位符</p>
</li>
<li><p>URL 是 URI 的子集，也就是说每个 URL 都是 URI ，但不是每个 URI 都是 URL。</p>
</li>
</ul>
<h3 id="1-1-2-超文本"><a href="#1-1-2-超文本" class="headerlink" title="1.1.2 超文本"></a>1.1.2 超文本</h3><ul>
<li>超文本，其英文名称叫作 hypertext ，我们在浏览器里看到的网页就是超文本解析而成的， 网页源代码是一系列 HTML 代码，而网页的源代码 HTML 就可以称作超文本。</li>
</ul>
<h3 id="1-1-3-HTTP-和-HTTPS"><a href="#1-1-3-HTTP-和-HTTPS" class="headerlink" title="1.1.3 HTTP 和 HTTPS"></a>1.1.3 HTTP 和 HTTPS</h3><ul>
<li>HTTP 全称是 Hyper Text Transfer Protocol ，中文叫作超文本传输协议。HTTP 协议是用于从网络传输超文本数据到本地浏览器的传送协议，它能保证高效而准确地传送超文本文档。</li>
<li>HTTPS 的全称是 Hyper Text Transfer Protocol over Secure Socket Layer ，是以安全为目标的 HTTP 通道，简单讲是 HTTP 的安全版， HTTP 下加入 SSL  层，简称为 HTTPS。</li>
</ul>
<h3 id="1-1-4-请求"><a href="#1-1-4-请求" class="headerlink" title="1.1.4 请求"></a>1.1.4 请求</h3><ul>
<li>请求，由客户端向服务端发出，可以分为 4 部分内容：请求方法 (Request Method)、请求的网址 (Request URL)、请求头 (Request Headers)、请求体 (Request Body)。</li>
</ul>
<ol>
<li>请求方法<ul>
<li>常见的方法有两种：GET 和 POST。</li>
<li>GET 和 POST 请求方法有如下区别<ul>
<li>GET 请求中的参数包含在 URL 里面，数据可以在 URL 中看到，而 POST 请求的 URL 不会包含这些数据，数据都是通过表单形式传输的，会包含在请求体中。</li>
<li>GET 请求提交的数据最多只有 1024 字节，而 POST 方式没有限制。</li>
</ul>
</li>
</ul>
</li>
</ol>
<table>
<!-- scope="col" ties each header cell to its column for assistive technology. -->
<thead>
<tr>
<th scope="col">方法</th>
<th scope="col">描述</th>
</tr>
</thead>
<tbody><tr>
<td>GET</td>
<td>请求页面，并返回页面内容</td>
</tr>
<tr>
<td>HEAD</td>
<td>类似于 GET 请求， 只不过返回的响应中没有具体的内容，用于获取报头</td>
</tr>
<tr>
<td>POST</td>
<td>大多用于提交表单或上传文件，数据包含在请求体中</td>
</tr>
<tr>
<td>PUT</td>
<td>从客户端向服务器传送的数据取代指定文档中的内容</td>
</tr>
<tr>
<td>DELETE</td>
<td>请求服务器删除指定的页面</td>
</tr>
<tr>
<td>CONNECT</td>
<td>把服务器当作跳板，让服务器代替客户端访问其他网页</td>
</tr>
<tr>
<td>OPTIONS</td>
<td>允许客户端查看服务器的性能</td>
</tr>
<tr>
<td>TRACE</td>
<td>回显服务器收到的请求，主要用于测试或诊断</td>
</tr>
</tbody></table>
<ol start="2">
<li>请求的网址</li>
</ol>
<ul>
<li>请求的网址，即统一资源定位符 URL ，它可以唯一确定我们想请求的资源。</li>
</ul>
<ol start="3">
<li>请求头</li>
</ol>
<ul>
<li><p> Host ：用于指定请求资源的主机 IP 和端口号，其内容为请求 URL 的原始服务器或网关的位置，从 HTTP 1.1版本开始，请求必须包含此内容。</p>
</li>
<li><p>Cookie ：也常用复数形式 Cookies ，这是网站为了辨别用户进行会话跟踪而存储在用户本地的数据。它的主要功能是维持当前访问会话。例如，我们输入用户名和密码成功登录某个网站后，服务器会用会话保存登录状态信息，后面我们每次刷新或请求该站点的其他页面时，会发现都是登录状态，这就是 Cookies 的功劳。Cookies 里有信息标识了我们所对应的服务器的会话，每次浏览器在请求该站点的页面时，都会在请求头中加上 Cookies 并将其发送给服务器，服务器通过 Cookies 识别出是我们自己，并且查出当前状态是登录状态，所以返回结果就是登录之后才能看到的网页内容。</p>
</li>
<li><p>Referer ：此内容用来标识这个请求是从哪个页面发过来的，服务器可以拿到这一信息并做相应的处理，如做来源统计、防盗链处理等。</p>
</li>
<li><p>User-Agent ：简称 UA ，它是一个特殊的字符串头，可以使服务器识别客户使用的操作系统及版本、浏览器及版本等信息。在做爬虫时加上此信息，可以伪装为浏览器；如果不加，很可能会被识别出为爬虫。</p>
</li>
</ul>
<ol start="4">
<li>请求体</li>
</ol>
<ul>
<li>请求体一般承载的内容是 POST 请求中的表单数据，而对于 GET 请求，请求体则为空。</li>
</ul>
<h3 id="1-1-5-响应"><a href="#1-1-5-响应" class="headerlink" title="1.1.5 响应"></a>1.1.5 响应</h3><ul>
<li>响应，由服务端返回给客户端，可以分为三部分：响应状态码（ Response Status Code ）、响应头（ Response Headers ）和响应体（ Response Body ）。</li>
</ul>
<ol>
<li>响应状态码</li>
</ol>
<p><img src="C:\Users\Aunean\AppData\Roaming\Typora\typora-user-images\image-20201116210501159.png" alt="image-20201116210501159"></p>
<p><img src="C:\Users\Aunean\AppData\Roaming\Typora\typora-user-images\image-20201116210517596.png" alt="image-20201116210517596"></p>
<h2 id="1-2-爬虫的基本原理"><a href="#1-2-爬虫的基本原理" class="headerlink" title="1.2 爬虫的基本原理"></a>1.2 爬虫的基本原理</h2><p>我们可以把互联网比作一张大网，而爬虫（即网络爬虫）便是在网上爬行的蜘蛛。把网的节点比作一个个网页，爬虫爬到这就相当于访问了该页面，获取了其信息。可以把节点间的连线比作网页与网页之间的链接关系，这样蜘蛛通过一个节点后，可以顺着节点连线继续爬行到达下一个节点，即通过一个网页继续获取后续的网页，这样整个网的节点便可以被蜘蛛全部爬行到，网站的数据就可以被抓取下来了。</p>
<h3 id="1-2-1-爬虫概述"><a href="#1-2-1-爬虫概述" class="headerlink" title="1.2.1 爬虫概述"></a>1.2.1 爬虫概述</h3><p>简单来说，爬虫就是获取网页并提取和保存信息的自动化程序，下面概要介绍一下。</p>
<ol>
<li>获取网页<ul>
<li>爬虫首先要做的工作就是获取网页，这里就是获取网页的源代码。源代码里包含了网页的部分有用信息，所以只要把源代码获取下来，就可以从中提取想要的信息了。</li>
</ul>
</li>
<li>提取信息<ul>
<li>获取网页源代码后，接下来就是分析网页源代码，从中提取我们想要的数据。</li>
</ul>
</li>
<li>保存数据<ul>
<li>可以简单保存为 TXT 文本或 JSON 文本，也可以保存到数据库，如 MySQL、MongoDB 等。</li>
</ul>
</li>
<li>自动化程序<ul>
<li>说到自动化程序，意思是说爬虫可以代替人来完成这些操作。首先，我们手工当然可以提取这些信息，但是当量特别大或者想快速获取大量数据的话，肯定还是要借助程序。</li>
</ul>
</li>
</ol>
<h3 id="1-2-2-能抓取怎样的数据"><a href="#1-2-2-能抓取怎样的数据" class="headerlink" title="1.2.2 能抓取怎样的数据"></a>1.2.2 能抓取怎样的数据</h3><ol>
<li>最常见的便是常规网页，它们对应着 HTML 代码，而最常抓取的便是 HTML 源代码。</li>
<li>JSON 字符串（其中 API 接口大多采用这样的形式），这种格式的数据方便传输和解析，它们同样可以抓取，而且数据提取更加方便。</li>
<li>各种二进制数据，如图片、视频和音频等。利用爬虫，我们可以将这些二进制数据抓取下来，然后保存成对应的文件名。</li>
</ol>
<h3 id="1-2-3-JavaScript-渲染页面"><a href="#1-2-3-JavaScript-渲染页面" class="headerlink" title="1.2.3 JavaScript 渲染页面"></a>1.2.3 JavaScript 渲染页面</h3><ul>
<li>有时候，在用 urllib 或 requests 抓取网页时，可以看到的源代码实际和浏览器中看到的不一样。</li>
<li>这是一个非常常见的问题。现在网页越来越多地采用 Ajax 、前端模块化工具来构建，整个网页可能都是由 JavaScript 渲染出来的，也就是说原始的 HTML 代码就是一个空壳。</li>
<li>对于这种，我们可以使用 Selenium、Splash 这样的库来实现模拟 JavaScript 渲染。</li>
<li>其次可以通过post请求。</li>
</ul>
<h1 id="第二章、基本库的使用"><a href="#第二章、基本库的使用" class="headerlink" title="第二章、基本库的使用"></a>第二章、基本库的使用</h1><h2 id="2-1-使用-urllib"><a href="#2-1-使用-urllib" class="headerlink" title="2.1 使用 urllib"></a>2.1 使用 urllib</h2><blockquote>
<p>首先，了解一下 urllib 库，python 内置的 HTTP 请求库，不需要额外的安装。</p>
<p>包含四个模块：</p>
<ul>
<li>request：最基本的 HTTP 请求模块，用来模拟发送请求。</li>
<li>error：异常处理模块，如果出现请求错误，可以捕获这些异常，然后进行重试或其他操作以保证程序不会意外终止。</li>
<li>parse：工具模块，提供了许多 URL 处理方法，比如拆分、解析、合并等。</li>
<li>robotparser ：主要是用来识别网站的 robots.txt 文件，然后判断哪些网站可以爬，哪些网站不可以爬，用的很少。</li>
</ul>
</blockquote>
<h3 id="2-1-1-发送请求"><a href="#2-1-1-发送请求" class="headerlink" title="2.1.1 发送请求"></a>2.1.1 发送请求</h3><ol>
<li><strong>urlopen()</strong></li>
</ol>
<p>urllib.request 模块提供了最基本的构造 HTTP 请求的方法， 利用它可以模拟浏览器的一个请求发起过程， 同时它还带有处理授权验证（authentication）、重定向（ redirection ）、浏览器 Cookies 及其他内容。</p>
<pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> urllib<span class="token punctuation">.</span>request

<span class="token comment" spellcheck="true"># 这里定义 response 变量接返回的 HTML</span>
response <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>urlopen<span class="token punctuation">(</span><span class="token string">'https://www.python.org'</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># read() 读取为二进制的数据，再使用 decode() 方法把它转为字符串</span>
<span class="token comment" spellcheck="true"># 并设置字符编码为 utf-8 ，这里要注意，有些网页编码可能是 gb2312 或其他。</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>response<span class="token punctuation">.</span>read<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li><p>下面我们看下 urlopen() 函数的 API：</p>
</li>
<li><p>urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None ) </p>
</li>
<li><p>data 参数：</p>
<p>data 参数是可选的。如果传递了这个参数，则它的请求方式就不再是 GET 方式，而是 POST 方式。</p>
</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> urllib<span class="token punctuation">.</span>parse
<span class="token keyword">import</span> urllib<span class="token punctuation">.</span>request

data <span class="token operator">=</span> bytes<span class="token punctuation">(</span>urllib<span class="token punctuation">.</span>parse<span class="token punctuation">.</span>urlencode<span class="token punctuation">(</span><span class="token punctuation">{</span><span class="token string">'word'</span><span class="token punctuation">:</span> <span class="token string">'hello'</span><span class="token punctuation">}</span><span class="token punctuation">)</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
response <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>urlopen<span class="token punctuation">(</span><span class="token string">'http://httpbin.org/post'</span><span class="token punctuation">,</span> data<span class="token operator">=</span>data<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>response<span class="token punctuation">.</span>read<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li><p>timeout 参数：</p>
<p>timeout 参数用于设置超时时间，单位为秒，意思就是如果请求超出了设置的这个时间， 还没有得到响应，就会抛出异常。如果不指定该参数，就会使用全局默认时间。</p>
<p>通过设置这个超时时间来控制一个网页如果长时间未响应，就跳过它的抓取。</p>
</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> socket
<span class="token keyword">import</span> urllib<span class="token punctuation">.</span>request
<span class="token keyword">import</span> urllib<span class="token punctuation">.</span>error

<span class="token keyword">try</span><span class="token punctuation">:</span>
    response <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>urlopen<span class="token punctuation">(</span><span class="token string">'http://httpbin.org/get'</span><span class="token punctuation">,</span> timeout<span class="token operator">=</span><span class="token number">0.1</span><span class="token punctuation">)</span>
<span class="token keyword">except</span> urllib<span class="token punctuation">.</span>error<span class="token punctuation">.</span>URLError <span class="token keyword">as</span> e<span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># 可以用 isinstance() 方法来判断它的类型，作出更详细的异常判断</span>
    <span class="token keyword">if</span> isinstance<span class="token punctuation">(</span>e<span class="token punctuation">.</span>reason<span class="token punctuation">,</span> socket<span class="token punctuation">.</span>timeout<span class="token punctuation">)</span><span class="token punctuation">:</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'TIME OUT'</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li><p>其他参数：</p>
<p>除了 data 参数和 timeout 参数外，还有 context 参数，它必须是 ssl.SSLContext 类型，用来指定SSL 设置。</p>
<p>此外，cafile 和 capath 这两个参数分别指定 CA 证书和它的路径，这个在请求 HTTPS 链接时会有用。</p>
<p>cadefault 参数现在已经弃用了，其默认值为 False。</p>
<p><a target="_blank" rel="noopener" href="https://docs.python.org/3/library/urllib.request.html">官方文档</a></p>
</li>
</ul>
<ol start="2">
<li><strong>Request</strong></li>
</ol>
<p>利用 urlopen() 方法可以实现最基本请求的发起，但这几个简单的参数并不足以构建一个完整的请求。如果请求中需要加入 Headers 等信息，就可以利用更强大的 Request 类来构建。</p>
<pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> urllib<span class="token punctuation">.</span>request

headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'user-agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'</span>
<span class="token punctuation">}</span>
request <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>Request<span class="token punctuation">(</span><span class="token string">'https://python.org'</span><span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span>
response <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>urlopen<span class="token punctuation">(</span>request<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>response<span class="token punctuation">.</span>read<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li><p>下面我们看下 Request 的构造方法</p>
<ul>
<li>class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)</li>
</ul>
</li>
<li><p>第一个参数 url 用于请求 URL，这是必传参数，其他都是可选参数。</p>
</li>
<li><p>第二个参数 data 如果要传，必须传 bytes （字节流）类型的。如果它是字典，可以先用 urllib.parse 模块里的 urlencode() 编码。</p>
</li>
<li><p>第三个参数 headers 是一个字典，它就是请求头，我们可以在构造请求时通过 headers 参数直接构造，也可以通过调用请求实例的 add_header() 方法添加。</p>
</li>
<li><p>第四个参数 origin_req_host 指的是请求方的 host 名称或者 IP 地址。</p>
</li>
<li><p>第五个参数 unverifiable 表示这个请求是否是无法验证的，默认是 False ，意思就是说用户没有足够权限来选择接收这个请求的结果。</p>
</li>
<li><p>第六个参数 method 是一个字符串，用来指示请求使用的方法，比如 GET、POST、PUT 等。</p>
</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> urllib<span class="token punctuation">.</span>request
<span class="token keyword">import</span> urllib<span class="token punctuation">.</span>parse

url <span class="token operator">=</span> <span class="token string">'http://httpbin.org/post'</span>
headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span>
        <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'</span><span class="token punctuation">,</span>
    <span class="token string">'Host'</span><span class="token punctuation">:</span> <span class="token string">'httpbin.org'</span>
<span class="token punctuation">}</span>

dict <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'Germey'</span>
<span class="token punctuation">}</span>
data <span class="token operator">=</span> bytes<span class="token punctuation">(</span>urllib<span class="token punctuation">.</span>parse<span class="token punctuation">.</span>urlencode<span class="token punctuation">(</span>dict<span class="token punctuation">)</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
req <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>Request<span class="token punctuation">(</span>url<span class="token operator">=</span>url<span class="token punctuation">,</span> data<span class="token operator">=</span>data<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">,</span> method<span class="token operator">=</span><span class="token string">'POST'</span><span class="token punctuation">)</span>
response <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>urlopen<span class="token punctuation">(</span>req<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>response<span class="token punctuation">.</span>read<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</code></pre>
<ol start="3">
<li><strong>高级用法</strong></li>
</ol>
<p>那对于一些更高级的操作（比如 Cookies 处理、代理设置等），该如何操作？</p>
<p>接下来，就需要更强大的工具 Handler 登场了。简而言之，我们可以把它理解为各种处理器，有专门处理登录验证的，有处理 Cookies 的，有处理代理设置的。利用它们，我们几乎可以做到 HTTP 请求中所有的事情。</p>
<p>urllib.request 模块里的 BaseHandler 类，它是所有其他 Handler 的父类，它提供了最基本的方法，例如 default_open()、protocol_request() 等。</p>
<p>接下来，就有各种 Handler 子类继承这个 BaseHandler 类，举例如下：</p>
<ul>
<li>HTTPDefaultErrorHandler：处理 HTTP 响应错误，错误都会抛出 HTTPError 类型的异常。</li>
<li>HTTPRedirectHandler：用于处理重定向。</li>
<li>HTTPCookieProcessor：用于处理 Cookies。</li>
<li>ProxyHandler：用于设置代理，默认代理为空。</li>
<li>HTTPPasswordMgr：用于管理密码，它维护了用户名和密码的表。</li>
<li>HTTPBasicAuthHandler：用于管理认证，如果一个链接打开时需要认证，那么可以用它来解决认证问题。</li>
</ul>
<p><a target="_blank" rel="noopener" href="https://docs.python.org/3/library/urllib.request.html#urllib.request.BaseHandler">详见官方文档</a></p>
<ul>
<li>验证登陆，借助 HTTPBasicAuthHandler 可以完成</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">from</span> urllib<span class="token punctuation">.</span>request <span class="token keyword">import</span> HTTPPasswordMgrWithDefaultRealm<span class="token punctuation">,</span> HTTPBasicAuthHandler<span class="token punctuation">,</span> build_opener
<span class="token comment" spellcheck="true"># 对于异常的处理</span>
<span class="token keyword">from</span> urllib<span class="token punctuation">.</span>error <span class="token keyword">import</span> URLError

username <span class="token operator">=</span> <span class="token string">'username'</span>
password <span class="token operator">=</span> <span class="token string">'password'</span>
url <span class="token operator">=</span> <span class="token string">'https://www.zhihu.com/hot'</span>
p <span class="token operator">=</span> HTTPPasswordMgrWithDefaultRealm<span class="token punctuation">(</span><span class="token punctuation">)</span>
p<span class="token punctuation">.</span>add_password<span class="token punctuation">(</span>None<span class="token punctuation">,</span> url<span class="token punctuation">,</span> username<span class="token punctuation">,</span> password<span class="token punctuation">)</span>
auth_handler <span class="token operator">=</span> HTTPBasicAuthHandler<span class="token punctuation">(</span>p<span class="token punctuation">)</span>
opener <span class="token operator">=</span> build_opener<span class="token punctuation">(</span>auth_handler<span class="token punctuation">)</span>

<span class="token keyword">try</span><span class="token punctuation">:</span>
    result <span class="token operator">=</span> opener<span class="token punctuation">.</span>open<span class="token punctuation">(</span>url<span class="token punctuation">)</span>
    html <span class="token operator">=</span> result<span class="token punctuation">.</span>read<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>html<span class="token punctuation">)</span>
<span class="token keyword">except</span> URLError <span class="token keyword">as</span> e<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">.</span>reason<span class="token punctuation">)</span>
</code></pre>
<ul>
<li>代理</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">from</span> urllib<span class="token punctuation">.</span>error <span class="token keyword">import</span> URLError
<span class="token keyword">from</span> urllib<span class="token punctuation">.</span>request <span class="token keyword">import</span> ProxyHandler<span class="token punctuation">,</span> build_opener

<span class="token comment" spellcheck="true"># 这里的代理IP不能使用，网上的免费代理IP大多都不能使用。</span>
proxy_handler <span class="token operator">=</span> ProxyHandler<span class="token punctuation">(</span><span class="token punctuation">{</span>
    <span class="token string">'http'</span><span class="token punctuation">:</span> <span class="token string">'http://127.0.0.1:9743'</span><span class="token punctuation">,</span>
    <span class="token string">'https'</span><span class="token punctuation">:</span> <span class="token string">'https://127.0.0.1:9743'</span>
<span class="token punctuation">}</span><span class="token punctuation">)</span>
opener <span class="token operator">=</span> build_opener<span class="token punctuation">(</span>proxy_handler<span class="token punctuation">)</span>
<span class="token keyword">try</span><span class="token punctuation">:</span>
    response <span class="token operator">=</span> opener<span class="token punctuation">.</span>open<span class="token punctuation">(</span><span class="token string">'https://www.baidu.com'</span><span class="token punctuation">)</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>response<span class="token punctuation">.</span>read<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">except</span> URLError <span class="token keyword">as</span> e<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">.</span>reason<span class="token punctuation">)</span>
</code></pre>
<ul>
<li>Cookies</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># 获取网站的Cookies</span>
<span class="token keyword">import</span> http<span class="token punctuation">.</span>cookiejar
<span class="token keyword">import</span> urllib<span class="token punctuation">.</span>request
cookie <span class="token operator">=</span> http<span class="token punctuation">.</span>cookiejar<span class="token punctuation">.</span>CookieJar<span class="token punctuation">(</span><span class="token punctuation">)</span>
handler <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>HTTPCookieProcessor<span class="token punctuation">(</span>cookie<span class="token punctuation">)</span>
opener <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>build_opener<span class="token punctuation">(</span>handler<span class="token punctuation">)</span>
response <span class="token operator">=</span> opener<span class="token punctuation">.</span>open<span class="token punctuation">(</span><span class="token string">'https://www.zhihu.com/hot'</span><span class="token punctuation">)</span>
<span class="token keyword">for</span> item <span class="token keyword">in</span> cookie<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>item<span class="token punctuation">.</span>name<span class="token operator">+</span><span class="token string">'='</span><span class="token operator">+</span>item<span class="token punctuation">.</span>value<span class="token punctuation">)</span>
    

<span class="token keyword">import</span> urllib<span class="token punctuation">.</span>request
<span class="token keyword">import</span> http<span class="token punctuation">.</span>cookiejar
url <span class="token operator">=</span> <span class="token string">'https://www.zhihu.com/hot'</span>
headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'</span>
<span class="token punctuation">}</span>
cookie <span class="token operator">=</span> http<span class="token punctuation">.</span>cookiejar<span class="token punctuation">.</span>CookieJar<span class="token punctuation">(</span><span class="token punctuation">)</span>
handler <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>HTTPCookieProcessor<span class="token punctuation">(</span>cookie<span class="token punctuation">)</span>
opener <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>build_opener<span class="token punctuation">(</span>handler<span class="token punctuation">)</span>
request <span class="token operator">=</span> urllib<span class="token punctuation">.</span>request<span class="token punctuation">.</span>Request<span class="token punctuation">(</span>url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">,</span> method<span class="token operator">=</span><span class="token string">'GET'</span><span class="token punctuation">)</span>
r <span class="token operator">=</span> opener<span class="token punctuation">.</span>open<span class="token punctuation">(</span>request<span class="token punctuation">)</span>
html <span class="token operator">=</span> r<span class="token punctuation">.</span>read<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>html<span class="token punctuation">)</span>
</code></pre>
<h3 id="2-1-2-处理异常"><a href="#2-1-2-处理异常" class="headerlink" title="2.1.2 处理异常"></a>2.1.2 处理异常</h3><p>urllib 的 error 模块定义了由 request 模块产生的异常。如果出现了问题，request 模块便会抛出 error 模块中定义的异常。</p>
<ol>
<li>URLError</li>
</ol>
<pre class=" language-python"><code class="language-python"><span class="token keyword">from</span> urllib <span class="token keyword">import</span> request<span class="token punctuation">,</span> error
<span class="token keyword">try</span><span class="token punctuation">:</span>
    response <span class="token operator">=</span> request<span class="token punctuation">.</span>urlopen<span class="token punctuation">(</span><span class="token string">'https://cuiqingcai.com/index.htm'</span><span class="token punctuation">)</span>
<span class="token keyword">except</span> error<span class="token punctuation">.</span>URLError <span class="token keyword">as</span> e<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">.</span>reason<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 打开一个不存在的页面照理来说应该会报错，但是这时我们捕获了 URLError 这个异常。输出：Not Found。通过如上操作，我们就可以避免程序异常终止，同时异常得到了有效处理。</span>
</code></pre>
<ol start="2">
<li>HTTPError</li>
</ol>
<p>HTTPError 是 URLError 的子类，专门用来处理 HTTP 请求错误，比如认证请求失败等。</p>
<p>它有如下三个属性：</p>
<ul>
<li>code：返回 HTTP 状态码，比如 404 表示网页不存在，500 表示服务器内部错误等。</li>
<li>reason：同父类一样，用于返回错误的原因。</li>
<li>headers：返回响应头。</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">from</span> urllib <span class="token keyword">import</span> request<span class="token punctuation">,</span> error
<span class="token keyword">try</span><span class="token punctuation">:</span>
    response <span class="token operator">=</span> request<span class="token punctuation">.</span>urlopen<span class="token punctuation">(</span><span class="token string">'https://cuiqingcai.com/index.htm'</span><span class="token punctuation">)</span>
<span class="token keyword">except</span> error<span class="token punctuation">.</span>HTTPError <span class="token keyword">as</span> e<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">.</span>reason<span class="token punctuation">,</span> e<span class="token punctuation">.</span>code<span class="token punctuation">,</span> e<span class="token punctuation">.</span>headers<span class="token punctuation">,</span> sep<span class="token operator">=</span><span class="token string">'\n'</span><span class="token punctuation">)</span>
<span class="token keyword">except</span> error<span class="token punctuation">.</span>URLError <span class="token keyword">as</span> e<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">.</span>reason<span class="token punctuation">)</span>
<span class="token keyword">else</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Request Successfully'</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 因为 URLError 是 HTTPError 的父类，所以可以先选择捕获子类的错误，再去捕获父类的错误。</span>
</code></pre>
<h3 id="2-1-3-解析链接"><a href="#2-1-3-解析链接" class="headerlink" title="2.1.3 解析链接"></a>2.1.3 解析链接</h3><pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># urlparse() 该方法可以实现 URL 的识别和分段</span>
<span class="token keyword">from</span> urllib<span class="token punctuation">.</span>parse <span class="token keyword">import</span> urlparse
result <span class="token operator">=</span> urlparse<span class="token punctuation">(</span><span class="token string">'http://www.baidu.com/index.html;user?id=5#comment'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>result<span class="token punctuation">)</span><span class="token punctuation">,</span> result<span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># scheme='http'                协议</span>
<span class="token comment" spellcheck="true"># netloc='www.baidu.com'    域名</span>
<span class="token comment" spellcheck="true"># path='/index.html'        访问路径</span>
<span class="token comment" spellcheck="true"># params='user'                参数</span>
<span class="token comment" spellcheck="true"># query='id=5'                查询条件</span>
<span class="token comment" spellcheck="true"># fragment='comment'        锚点</span>
<span class="token comment" spellcheck="true"># 标准的链接格式</span>
<span class="token comment" spellcheck="true"># scheme://netloc/path;params?query#fragment</span>
</code></pre>
<h2 id="2-2-使用-requests"><a href="#2-2-使用-requests" class="headerlink" title="2.2 使用 requests"></a>2.2 使用 requests</h2><h3 id="2-2-1-基本用法"><a href="#2-2-1-基本用法" class="headerlink" title="2.2.1 基本用法"></a>2.2.1 基本用法</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> requests
r <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.baidu.com'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>r<span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>r<span class="token punctuation">.</span>status_code<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>r<span class="token punctuation">.</span>text<span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>r<span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>r<span class="token punctuation">.</span>cookies<span class="token punctuation">)</span>


<span class="token triple-quoted-string string">"""
对于 GET 请求，如果要附加额外的信息，一般怎样添加呢？比如现在想添加两个参数，
其中 name是germey, age是22 要构造这个请求链接
"""</span>
<span class="token keyword">import</span> requests

headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'</span>
<span class="token punctuation">}</span>

data <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'germey'</span><span class="token punctuation">,</span>
    <span class="token string">'age'</span><span class="token punctuation">:</span> <span class="token number">22</span>
<span class="token punctuation">}</span>
url <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'http://httpbin.org/get'</span><span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">,</span> data<span class="token operator">=</span>data<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>url<span class="token punctuation">.</span>text<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>url<span class="token punctuation">.</span>text<span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>url<span class="token punctuation">.</span>json<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li>抓取网页、添加headers</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
上面的请求链接返回的是 JSON 形式的字符串，那么如果请求普通的网页，则肯定能获得相应的
内容了 下面以“知乎”→“发现”页面为例
"""</span>
<span class="token comment" spellcheck="true"># 在这里，如果不设置U-A，就不能正常请求</span>
headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span>
        <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'</span>
<span class="token punctuation">}</span>
r <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.zhihu.com/explore'</span><span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>text
pattern <span class="token operator">=</span> <span class="token string">'view-id="5799">(.*?)&lt;/a>'</span>
titles <span class="token operator">=</span> re<span class="token punctuation">.</span>findall<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> r<span class="token punctuation">,</span> re<span class="token punctuation">.</span>S<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># print(titles)</span>
<span class="token keyword">for</span> i <span class="token keyword">in</span> titles<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>i<span class="token punctuation">)</span>
</code></pre>
<ul>
<li> 抓取二进制数据</li>
</ul>
<p>图片、音频、视频这些文件本质上都是由二进制码组成的，由于有特定的保存格式和对应的解析方式， 我们才可以看到这些形形色色的多媒体。所以，想要抓取它们，就要拿到它们的二进制码。</p>
<pre class=" language-python"><code class="language-python">url <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://github.com/favicon.ico'</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># print(url.text)  # 打印时转化为 str 类型，图片直接转化为字符串，会出现乱码</span>
<span class="token comment" spellcheck="true"># print(url.content)  # 前带有一个b</span>
<span class="token comment" spellcheck="true"># wb 以二进制写入打开一个文件。如果该文件已存在，则将其覆盖。</span>
<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'favicon.ico'</span><span class="token punctuation">,</span> <span class="token string">'wb'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># content 是将网页数据以二进制输出</span>
    f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>url<span class="token punctuation">.</span>content<span class="token punctuation">)</span>
</code></pre>
<ul>
<li>post请求</li>
</ul>
<pre class=" language-python"><code class="language-python">data <span class="token operator">=</span> <span class="token punctuation">{</span><span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'germey'</span><span class="token punctuation">,</span> <span class="token string">'age'</span><span class="token punctuation">:</span> <span class="token number">22</span><span class="token punctuation">}</span>
r <span class="token operator">=</span> requests<span class="token punctuation">.</span>post<span class="token punctuation">(</span><span class="token string">'http://httpbin.org/post'</span><span class="token punctuation">,</span> data<span class="token operator">=</span>data<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>r<span class="token punctuation">.</span>text<span class="token punctuation">)</span>

<span class="token triple-quoted-string string">"""
2．POST方式
网页的访问方式除了GET方式以外，还有POST方式。
有一些网页，使用GET和POST方式访问同样的网址，得到的结果是不一样的。
还有另外一些网页，只能使用POST方式访问，如果使用GET方式访问，网站会直接返回错误信息。
此时就需要使用requests的post()方法来获取源代码。
post()方法的格式如下：
import requests
data = {'key1': 'value1','key2': 'value2'}
html_formdata = requests.post('网址', data=data).content.decode() 
其中，data这个字典的内容和项数需要根据实际情况修改，Key和Value在不同的网站是不一样的。
而做爬虫，构造这个字典是任务之一。
还有一些网址，提交的内容需要是JSON格式的，因此post()方法的参数需要进行一些修改：
html_json = requests.post('网址', json=data).content.decode() 
#使用JSON提交数据。这样写代码，requests可以自动将字典转换为JSON字符串。
"""</span>

<span class="token comment" spellcheck="true"># 2.post方式的使用</span>
<span class="token comment" spellcheck="true"># 通过formdata提交数据</span>
data <span class="token operator">=</span> <span class="token punctuation">{</span><span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'ouni'</span><span class="token punctuation">,</span> <span class="token string">'password'</span><span class="token punctuation">:</span> <span class="token string">'123456'</span><span class="token punctuation">}</span>
html_formdata <span class="token operator">=</span> requests<span class="token punctuation">.</span>post<span class="token punctuation">(</span><span class="token string">'http://exercise.kingname.info/exercise_requests_post'</span><span class="token punctuation">,</span> data<span class="token operator">=</span>data<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>html_formdata<span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># 通过JSON提交数据</span>
html_formdata <span class="token operator">=</span> requests<span class="token punctuation">.</span>post<span class="token punctuation">(</span><span class="token string">'http://exercise.kingname.info/exercise_requests_post'</span><span class="token punctuation">,</span> json<span class="token operator">=</span>data<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>html_formdata<span class="token punctuation">)</span>
</code></pre>
<h3 id="2-2-2-高级用法"><a href="#2-2-2-高级用法" class="headerlink" title="2.2.2 高级用法"></a>2.2.2 高级用法</h3><ol>
<li>文件上传</li>
</ol>
<p>requests 可以模拟提交一些数据，假如有的网站需要上传文件，我们也可以用它来实现。</p>
<pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># favicon.ico 为文件名，此外还需注意路径。</span>
files <span class="token operator">=</span> <span class="token punctuation">{</span><span class="token string">'file'</span><span class="token punctuation">:</span> open<span class="token punctuation">(</span><span class="token string">'favicon.ico'</span><span class="token punctuation">,</span> <span class="token string">'rb'</span><span class="token punctuation">)</span><span class="token punctuation">}</span>
r <span class="token operator">=</span> requests<span class="token punctuation">.</span>post<span class="token punctuation">(</span><span class="token string">'http://httpbin.org/post'</span><span class="token punctuation">,</span> files<span class="token operator">=</span>files<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>r<span class="token punctuation">.</span>text<span class="token punctuation">)</span>
</code></pre>
<ol start="2">
<li>Cookies</li>
</ol>
<pre class=" language-python"><code class="language-python">r <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.baidu.com'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>r<span class="token punctuation">.</span>cookies<span class="token punctuation">)</span>
<span class="token keyword">for</span> key<span class="token punctuation">,</span> value <span class="token keyword">in</span> r<span class="token punctuation">.</span>cookies<span class="token punctuation">.</span>items<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>key <span class="token operator">+</span> <span class="token string">'='</span> <span class="token operator">+</span> value<span class="token punctuation">)</span>
</code></pre>
<ol start="3">
<li>会话维持</li>
</ol>
<pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># 普通方法</span>
<span class="token comment" spellcheck="true"># requests.get("http://httpbin.org/cookies/set/number/123456789")</span>
<span class="token comment" spellcheck="true"># r = requests.get('http://httpbin.org/cookies')</span>
<span class="token comment" spellcheck="true"># print(r.text)</span>

<span class="token comment" spellcheck="true"># 使用 Session</span>
s <span class="token operator">=</span> requests<span class="token punctuation">.</span>Session<span class="token punctuation">(</span><span class="token punctuation">)</span>
s<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">"http://httpbin.org/cookies/set/number/123456789"</span><span class="token punctuation">)</span>
r <span class="token operator">=</span> s<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'http://httpbin.org/cookies'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>r<span class="token punctuation">.</span>text<span class="token punctuation">)</span>
</code></pre>
<ol start="4">
<li>SSL 证书验证</li>
</ol>
<pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># 使用 verify 参数控制是否检查此证书</span>
<span class="token keyword">from</span> requests<span class="token punctuation">.</span>packages <span class="token keyword">import</span> urllib3
urllib3<span class="token punctuation">.</span>disable_warnings<span class="token punctuation">(</span><span class="token punctuation">)</span>
response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.12306.cn'</span><span class="token punctuation">,</span> verify<span class="token operator">=</span><span class="token boolean">False</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>response<span class="token punctuation">.</span>status_code<span class="token punctuation">)</span>
</code></pre>
<ol start="5">
<li>代理设置</li>
</ol>
<pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">'''
对于某些网站，在测试的时候请求几次，能正常获取内容 但是一旦开始大规模爬取，对于大规模且频繁的请求，网站可能会弹出验证码，或者跳转到登录认证页面，更甚者可能会直接封禁客户端，导致一定时间段内无法访问。为了防止这种情况发生，需要设置代理来解决这个问题，这就需要用到 proxies 参数
'''</span>
<span class="token string">"使用 HTTP Basic Auth,可以使用类似 http://user:password@host:port 这样的语法来置代理，示例如下："</span>
<span class="token comment" spellcheck="true"># proxies = {</span>
<span class="token comment" spellcheck="true">#     'http': 'http://user:password@10.10.1.10:3128'</span>
<span class="token comment" spellcheck="true"># }</span>
<span class="token comment" spellcheck="true"># response = requests.get('https://www.taobao.com', proxies=proxies)</span>
<span class="token comment" spellcheck="true"># print(response)</span>


<span class="token comment" spellcheck="true"># pip install 'requests[socks]'</span>


<span class="token comment" spellcheck="true"># "除了基本的 HTTP 代理外，requests 还支持 SOCKS 协议的代理"</span>
<span class="token comment" spellcheck="true"># import requests</span>
<span class="token comment" spellcheck="true"># proxies = {</span>
<span class="token comment" spellcheck="true">#     'http': 'socks5://user:password@host:port',</span>
<span class="token comment" spellcheck="true">#     'https': 'socks5://user:password@host:port'</span>
<span class="token comment" spellcheck="true"># }</span>
<span class="token comment" spellcheck="true"># response = requests.get('https://www.taobao.com', proxies=proxies)</span>
<span class="token comment" spellcheck="true"># print(response)</span>
</code></pre>
<h2 id="2-3-正则表达式"><a href="#2-3-正则表达式" class="headerlink" title="2.3 正则表达式"></a>2.3 正则表达式</h2><h3 id="2-3-1-常用的匹配规则"><a href="#2-3-1-常用的匹配规则" class="headerlink" title="2.3.1 常用的匹配规则"></a>2.3.1 常用的匹配规则</h3><p><img src="C:\Users\Aunean\AppData\Roaming\Typora\typora-user-images\image-20201124104319661.png" alt="image-20201124104319661"></p>
<h3 id="2-3-2-match"><a href="#2-3-2-match" class="headerlink" title="2.3.2 match()"></a>2.3.2 match()</h3>
<pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> re
content <span class="token operator">=</span> <span class="token string">'Hello 123 4567 World_This is a Regex Demo'</span>
<span class="token comment" spellcheck="true"># print(len(content))</span>
result <span class="token operator">=</span> re<span class="token punctuation">.</span>match<span class="token punctuation">(</span><span class="token string">'^Hello\s\d\d\d\s\d{4}\s\w{10}'</span><span class="token punctuation">,</span> content<span class="token punctuation">,</span> re<span class="token punctuation">.</span>S<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># print(result)</span>
<span class="token comment" spellcheck="true"># print(result.group())</span>
<span class="token comment" spellcheck="true"># print(result.span())</span>
</code></pre>
<ul>
<li>匹配目标</li>
</ul>
<p>刚才我们用 match() 方法可以得到匹配到的字符串内容，但是如果想从字符串中提取一部分内容，该怎么办呢？就像最前面的实例一样，从一段文本中提取出邮件或电话号码等内容。</p>
<pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
1.使用match()方法进行匹配，如果在起始位置匹配成功，则返回Match对象，否则返回None，
语法格式：
re.match(pattern, string, [flags])
pattern：表示模式字符串，由要匹配的正则表达式转换而来
string：表示要匹配的字符串
flags：可选参数，表示标志位，用于控制匹配方式，如是否区分字母大小写。如下：
   标志                       说明
A或ASCII         对于\w \W \b \B \D \s \S 只进行ASCII匹配
I或IGNORECASE    执行不区分字母大小写的匹配
M或MULTILINE     将^和$用于包括整个字符串的开始和结尾的每一行
S或DOTALL        使用(.)字符匹配所有字符，包括换行符
X或VERBOSE       忽略模式字符串中未转义的空格和注释
"""</span>

pattern <span class="token operator">=</span> r<span class="token string">'mr_\w+'</span>
string <span class="token operator">=</span> <span class="token string">'MR_SHOP mr_shop'</span>
match <span class="token operator">=</span> re<span class="token punctuation">.</span>match<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> string<span class="token punctuation">,</span> re<span class="token punctuation">.</span>I<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>match<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'匹配值的起始位置：'</span><span class="token punctuation">,</span> match<span class="token punctuation">.</span>start<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'匹配值的结束位置：'</span><span class="token punctuation">,</span> match<span class="token punctuation">.</span>end<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'匹配位置的元组：'</span><span class="token punctuation">,</span> match<span class="token punctuation">.</span>span<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'要匹配的字符串：'</span><span class="token punctuation">,</span> match<span class="token punctuation">.</span>string<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'匹配数据：'</span><span class="token punctuation">,</span> match<span class="token punctuation">.</span>group<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
string <span class="token operator">=</span> <span class="token string">'项目名称MR_SHOP mr_shop'</span>
match <span class="token operator">=</span> re<span class="token punctuation">.</span>match<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> string<span class="token punctuation">,</span> re<span class="token punctuation">.</span>I<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>match<span class="token punctuation">)</span>


<span class="token keyword">import</span> re
result <span class="token operator">=</span> re<span class="token punctuation">.</span>match<span class="token punctuation">(</span><span class="token string">'^Hello\s(.*?)\s\w{10} (.*?)\s'</span><span class="token punctuation">,</span> content<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;re.Match object; span=(0, 29), match='Hello 123 4567 World_This is '></span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">.</span>group<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># Hello 123 4567 World_This is（末尾含一个空格）</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">.</span>group<span class="token punctuation">(</span><span class="token number">1</span><span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">' '</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># 1234567</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">.</span>group<span class="token punctuation">(</span><span class="token number">2</span><span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># is</span>
</code></pre>
<ul>
<li>修饰符</li>
</ul>
<p><img src="C:\Users\Aunean\AppData\Roaming\Typora\typora-user-images\image-20201124105453279.png" alt="image-20201124105453279"></p>
<h3 id="2-3-3-search"><a href="#2-3-3-search" class="headerlink" title="2.3.3 search()"></a>2.3.3 search()</h3><pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
search()方法用于整个字符串中搜索第一个匹配的值，如果匹配成功，则返回Match对象，
否则返回None。语法格式如下：
re.search(pattern, string, [flags])
"""</span>
pattern <span class="token operator">=</span> r<span class="token string">'mr_\w+'</span>
string <span class="token operator">=</span> <span class="token string">'MR_SHOP mr_shop'</span>
match <span class="token operator">=</span> re<span class="token punctuation">.</span>search<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> string<span class="token punctuation">,</span> re<span class="token punctuation">.</span>I<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>match<span class="token punctuation">)</span>
string <span class="token operator">=</span> <span class="token string">'项目名称MR_SHOP mr_shop'</span>
match <span class="token operator">=</span> re<span class="token punctuation">.</span>search<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> string<span class="token punctuation">,</span> re<span class="token punctuation">.</span>I<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>match<span class="token punctuation">)</span>
</code></pre>
<h3 id="2-3-4-findall"><a href="#2-3-4-findall" class="headerlink" title="2.3.4 findall()"></a>2.3.4 findall()</h3><pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
findall()方法用于在整个字符串中搜索所有符合正则表达式的字符串，并以列表的形式
返回。如果匹配成功，则返回包含匹配结果的列表，否则返回空列表。
re.findall(pattern, string, [flags])
"""</span>
pattern <span class="token operator">=</span> r<span class="token string">'mr_\w+'</span>
string <span class="token operator">=</span> <span class="token string">'MR_SHOP mr_shop'</span>
match <span class="token operator">=</span> re<span class="token punctuation">.</span>findall<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> string<span class="token punctuation">,</span> re<span class="token punctuation">.</span>I<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>match<span class="token punctuation">)</span>
string <span class="token operator">=</span> <span class="token string">'项目名称MR_SHOP mr_shop'</span>
match <span class="token operator">=</span> re<span class="token punctuation">.</span>findall<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> string<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>match<span class="token punctuation">)</span>

pattern <span class="token operator">=</span> r<span class="token string">'([1-9]{1,3}(\.[0-9]{1,3}){3})'</span>
str1 <span class="token operator">=</span> <span class="token string">'127.0.0.1 192.168.1.66'</span>
match <span class="token operator">=</span> re<span class="token punctuation">.</span>findall<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> str1<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>match<span class="token punctuation">)</span>
<span class="token keyword">for</span> i <span class="token keyword">in</span> match<span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>i<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
</code></pre>
<h3 id="2-3-5-sub"><a href="#2-3-5-sub" class="headerlink" title="2.3.5 sub()"></a>2.3.5 sub()</h3><pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
sub()
除了使用正则表达式提取信息外，有时候还需要借助它来修改文本,
比如，想要把一串文本中的所有数字都去掉，如果只用字符串的 replace()
方法，那就太烦琐了，这时可以借助 sub() 方法
"""</span>
content <span class="token operator">=</span> <span class="token string">'54aKS4yrsoiRS4ixSL2g数字'</span>
content <span class="token operator">=</span> re<span class="token punctuation">.</span>sub<span class="token punctuation">(</span><span class="token string">'\d+|[\u4e00-\u9fa5]'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">,</span> content<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>content<span class="token punctuation">)</span>
</code></pre>
<h1 id="第三章、解析库的使用"><a href="#第三章、解析库的使用" class="headerlink" title="第三章、解析库的使用"></a>第三章、解析库的使用</h1><h2 id="3-1-使用-XPath"><a href="#3-1-使用-XPath" class="headerlink" title="3.1 使用 XPath"></a>3.1 使用 XPath</h2><p>XPath 全称 XML Path Language ，即 XML 路径语言，它是一门在 XML 文档中查找信息的语言。它最初是用来搜寻 XML 文档的，但是它同样适用于 HTML 文档的搜索。</p>
<p><a target="_blank" rel="noopener" href="https://www.w3.org/TR/xpath">官方文档</a></p>
<pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
XPath语句格式
         核心思想：写XPath就是写地址。
         获取文本：
//标签1[@属性1="属性值1"]/标签2[@属性2="属性值2"]/..../text()
获取属性值：
//标签1[@属性1="属性值1"]/标签2[@属性2="属性值2"]/..../@属性n
其中，[@属性="属性值"]不是必需的。它的作用是帮助过滤相同的标签。
在不需要过滤相同标签的情况下可以省略。

哪些属性可以省略
&lt;ul>标签本身就没有属性，则写XPath的时候，其属性可以省略。
标签有属性，但是如果这个标签的所有属性值都相同，则可以省略属性，
例如&lt;li class="info">，所有的&lt;li>标签都有一个class属性，值都为info，所以属性可以省略。

"""</span>
</code></pre>
<p><mark><strong>对于 tbody 标签，XPath 实际上往往匹配不到（浏览器会自动补全 tbody，而网页源码中可能并没有），编写表达式时需要注意</strong></mark></p>
<h3 id="3-1-1-XPath-常用使用规则"><a href="#3-1-1-XPath-常用使用规则" class="headerlink" title="3.1.1 XPath 常用使用规则"></a>3.1.1 XPath 常用使用规则</h3><table>
<thead>
<tr>
<th>表达式</th>
<th>描述</th>
</tr>
</thead>
<tbody><tr>
<td>nodename</td>
<td>选取此节点的所有子节点</td>
</tr>
<tr>
<td>/</td>
<td>从当前节点选取直接子节点</td>
</tr>
<tr>
<td>//</td>
<td>从当前节点选取子孙节点</td>
</tr>
<tr>
<td>.</td>
<td>选取当前节点</td>
</tr>
<tr>
<td>..</td>
<td>选取当前节点的父节点</td>
</tr>
<tr>
<td>@</td>
<td>选取属性</td>
</tr>
</tbody></table>
<pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># 导入lxml库</span>
<span class="token keyword">from</span> lxml <span class="token keyword">import</span> etree

<span class="token comment" spellcheck="true"># 读取text.html文件并转化为元素树对象</span>
parse <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTMLParser<span class="token punctuation">(</span>encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
tree <span class="token operator">=</span> etree<span class="token punctuation">.</span>parse<span class="token punctuation">(</span><span class="token string">'text.html'</span><span class="token punctuation">,</span> parse<span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># 补充xpath表达式,获取所有书的名称</span>
<span class="token comment" spellcheck="true"># ********** Begin ********* #</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//book/title/text()'</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># *********** End ********** #</span>

<span class="token comment" spellcheck="true"># 补充xpath表达式,获取所有书的价格</span>
<span class="token comment" spellcheck="true"># ********** Begin ********* #</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//book/price/text()'</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># *********** End ********** #</span>

<span class="token comment" spellcheck="true"># 填写代码, 获取价格低于30的书名</span>
<span class="token comment" spellcheck="true"># ********** Begin ********* #</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//book/title[@class="good"]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># *********** End ********** #</span>

<span class="token comment" spellcheck="true"># 相对路径 book 节点选择</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//book[1]/title/text()'</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 相对路径 title 节点存在 class 属性条件选择</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//title[@class]/@class'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 同上, 但是使用了轴选择 class 属性值</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//title[@class]/attribute::class'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 绝对路径常规选择</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'/html/body/bookstore//book[1]/title/@class'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># 5获取价格小于30的书名</span>
book_name <span class="token operator">=</span> tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">"//book[price&lt;30]/title/text()"</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>book_name<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># 6获取价格等于39.95的书</span>
book_name <span class="token operator">=</span> tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">"//book[price=39.95]/title/text()"</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>book_name<span class="token punctuation">)</span>
</code></pre>
<h3 id="3-1-2-所有节点"><a href="#3-1-2-所有节点" class="headerlink" title="3.1.2 所有节点"></a>3.1.2 所有节点</h3><p>例：html.xpath('//*')</p>
<h3 id="3-1-3-子节点"><a href="#3-1-3-子节点" class="headerlink" title="3.1.3 子节点"></a>3.1.3 子节点</h3><p>例：html.xpath('//li/a')</p>
<h3 id="3-1-4-子孙节点"><a href="#3-1-4-子孙节点" class="headerlink" title="3.1.4 子孙节点"></a>3.1.4 子孙节点</h3><p>例：html.xpath('//li//a')</p>
<h3 id="3-1-5-父节点"><a href="#3-1-5-父节点" class="headerlink" title="3.1.5 父节点"></a>3.1.5 父节点</h3><p>选中 href 属性为 link4.html 节点，然后再获取其父节点，然后再获取其 class 属性，相关代码如下：</p>
<p>例：html.xpath('//a[@href="link4.html"]/../@class')</p>
<p>同时，也可以通过 parent:: 来获取父节点</p>
<p>html.xpath('//a[@href="link4.html"]/parent::*/@class')</p>
<h3 id="3-1-6-属性匹配"><a href="#3-1-6-属性匹配" class="headerlink" title="3.1.6 属性匹配"></a>3.1.6 属性匹配</h3><p>要选取 class 为 item-1 的 li节点，可以这样实现：</p>
<p>html.xpath('//li[@class="item-1"]')</p>
<h3 id="3-1-7-文本获取"><a href="#3-1-7-文本获取" class="headerlink" title="3.1.7 文本获取"></a>3.1.7 文本获取</h3><p>html.xpath('//li[@class="item-0"]/text()')</p>
<h3 id="3-1-8属性获取"><a href="#3-1-8属性获取" class="headerlink" title="3.1.8属性获取"></a>3.1.8属性获取</h3><p>获取所有 li 节点下所有 a 节点的 href 属性</p>
<p>html.xpath('//li/a/@href')</p>
<h3 id="3-1-9-属性多值匹配"><a href="#3-1-9-属性多值匹配" class="headerlink" title="3.1.9 属性多值匹配"></a>3.1.9 属性多值匹配</h3><pre class=" language-python"><code class="language-python">text <span class="token operator">=</span> <span class="token triple-quoted-string string">"""&lt;li class="li li-first">&lt;a href="link.html">first item&lt;/a>&lt;/li>"""</span>
html <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>text<span class="token punctuation">)</span>
<span class="token triple-quoted-string string">'''这里 HTML 文本中 li 节点的 class 属性有两个值 li li-first，
此时如果还想用之前的属性匹配获取，就无法匹配了'''</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[@class="li"]/a/text()'</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># result = html.xpath('//li[@class="li li-first"]/a/text()')</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># 这时就需要用 contains() 函数</span>
<span class="token triple-quoted-string string">'''这样通过 contains()方法，第一个参数传入属性名称，第二个参数传入属性值，只要此属性包含
所传入的属性值，就可以完成匹配'''</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[contains(@class, "li")]/a/text()'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
</code></pre>
<h3 id="3-1-10-多属性匹配"><a href="#3-1-10-多属性匹配" class="headerlink" title="3.1.10 多属性匹配"></a>3.1.10 多属性匹配</h3><pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
多属性匹配
我们可能还遇到一种情况，那就是根据多个属性确定一个节点，这时就需要同时匹配多个
属性。此时可以使用运算符 and 来连接。
"""</span>
text <span class="token operator">=</span> <span class="token string">'&lt;li class="li li-first" name="item">&lt;a href="link.html">first item&lt;/a>&lt;/li>'</span>
html <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>text<span class="token punctuation">)</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[contains(@class, "li") and @name="item"]/a/text()'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
</code></pre>
<p><strong>运算符及其介绍</strong><img src="C:\Users\Aunean\AppData\Roaming\Typora\typora-user-images\image-20201124143721294.png" alt="image-20201124143721294"></p>
<h3 id="3-1-11-按序选择"><a href="#3-1-11-按序选择" class="headerlink" title="3.1.11 按序选择"></a>3.1.11 按序选择</h3><p> <a href="..........%5Clearning%5Cspider%5Cproject%5C%E7%88%AC%E8%99%AB%E5%BC%80%E5%8F%91%5Ctext.html">text.html</a> </p>
<pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
按序选择
有时候，我们在选择的时候某些属性可能同时匹配了多个节点，但是只想要其中的某个节点，如
第二个节点或者最后一个节点，
"""</span>
html <span class="token operator">=</span> etree<span class="token punctuation">.</span>parse<span class="token punctuation">(</span><span class="token string">'../text.html'</span><span class="token punctuation">,</span> etree<span class="token punctuation">.</span>HTMLParser<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/a/text()'</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># 选取第一个 li 节点</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[last()]/a/text()'</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># 最后一个 li 节点</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[position()&lt;4]/a/text()'</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># 选取位置小于四的节点</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[last()-2]/a/text()'</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># 倒数第三个节点</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
</code></pre>
<h3 id="3-1-12-节点轴选择"><a href="#3-1-12-节点轴选择" class="headerlink" title="3.1.12 节点轴选择"></a>3.1.12 节点轴选择</h3><p>XPath 提供了很多节点轴选择方法，包括获取子元素 、兄弟元素、父元素、祖先元素等，示例如下：</p>
<pre class=" language-python"><code class="language-python">html <span class="token operator">=</span> etree<span class="token punctuation">.</span>parse<span class="token punctuation">(</span><span class="token string">'../text.html'</span><span class="token punctuation">,</span> etree<span class="token punctuation">.</span>HTMLParser<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 1.调用 ancestor 轴，获取所有祖先节点。然后是节点的选择器，这里直接使用*，表示匹配所有节点</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/ancestor::*'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 2.改 * 为 div ，这样得到的结果只有div这个祖先节点了</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/ancestor::div'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 3.调用 attribute 轴，获取所有属性值。其后跟的选择器还是＊，这代表</span>
<span class="token comment" spellcheck="true"># 获取节点的所有属性，返回值就是 li 节点的所有属性值</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/attribute::*'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 4.调用了 child 轴，可以获取所有直接子节点。这里我们又加了限定条件，选取</span>
<span class="token comment" spellcheck="true"># href 属性为 link1.html 的a节点</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/child::a[@href="link1.html"]'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 5.调用 descendant 轴，可以获取所有子孙节点。这里又添加了限定条件获取 span 节点，所以返回的结果只包含 span 节点</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/descendant::span'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 6.调用 following 轴，可以获取当前节点之后的所有节点。</span>
<span class="token comment" spellcheck="true"># 虽然这里使用*匹配，但又加了索引选择，所以只获取了第二个后续节点</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/following::*[2]'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 7.调用 following-sibling 轴，可以获取当前节点之后的所有同级节点。</span>
<span class="token comment" spellcheck="true"># 这里我们使用 * 匹配，所以获取了所有后续同级节点。</span>
result <span class="token operator">=</span> html<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//li[1]/following-sibling::*'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>result<span class="token punctuation">)</span>
</code></pre>
<p><a target="_blank" rel="noopener" href="http://www.w3school.com.cn/xpath/index.asp">更多xpath用法</a></p>
<h2 id="3-2-使用-Beautiful-Soup"><a href="#3-2-使用-Beautiful-Soup" class="headerlink" title="3.2 使用 Beautiful Soup"></a>3.2 使用 Beautiful Soup</h2><h3 id="3-2-1-基本用法"><a href="#3-2-1-基本用法" class="headerlink" title="3.2.1 基本用法"></a>3.2.1 基本用法</h3><pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
基本用法
"""</span>
html <span class="token operator">=</span> <span class="token triple-quoted-string string">'''&lt;html>&lt;head>&lt;title>The Dormouse's story&lt;/title>&lt;/head> 
&lt;body> 
&lt;p class="title" name="dromouse">&lt;b>The Dormouse's story&lt;/b>&lt;/p> 
&lt;p class="story">Once upon a time there were three little sisters; and their names were 
&lt;a href="http://example.com/elsie" class="sister" id="link1">&lt;!-- Elsie -->&lt;/a>, 
&lt;a href="http://example.com/lacie" class="sister" id="link2">Lacie&lt;/a> and 
&lt;a href="http://example.com/tillie" class="sister" id="link3">Tillie&lt;/a>; 
and they lived at the bottom of a well.&lt;/p> 
&lt;p class="story">...&lt;/p>
'''</span>
<span class="token string">'对于不标准的 HTML 字符串, BeautifulSoup 可以自动更正格式'</span>
soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token string">'调用 prettify()方法 这个方法可以把要解析的字符串以标准的缩进格式输出'</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>prettify<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>prettify<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>title<span class="token punctuation">.</span>string<span class="token punctuation">)</span>
</code></pre>
<h3 id="3-2-2-节点选择器"><a href="#3-2-2-节点选择器" class="headerlink" title="3.2.2 节点选择器"></a>3.2.2 节点选择器</h3><pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
节点选择器
直接调用节点的名称就可以选择节点元素，再调用 string 属性就可以得到节点内的文本了，这种
选择方式速度非常快。如果单个节点结构层次非常清晰，可以选用这种方式来解析
"""</span>
soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>title<span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;class 'bs4.element.Tag'></span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>title<span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;title>The Dormouse's story&lt;/title></span>
<span class="token comment" spellcheck="true"># 获取名称。利用 name 属性获取节点的名称</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>title<span class="token punctuation">.</span>name<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>head<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;p class="title" name="dromouse">&lt;b>The Dormouse's story&lt;/b>&lt;/p></span>
<span class="token string">'attrs 的返回结果是字典形式，它把选择的节点的所有属性和属性值组合成一个字典'</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>attrs<span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># {'class': ['title'], 'name': 'dromouse'}</span>
<span class="token triple-quoted-string string">'''这里需要注意的是，有的返回结果是字符串，有的返回结果是字符串组成的列表。比如，
name 属性的值是唯一的，返回的结果就是单个字符串；而对于 class，一个节点元素可能有多个 class，所以
返回的是列表'''</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>attrs<span class="token punctuation">[</span><span class="token string">'name'</span><span class="token punctuation">]</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># dromouse</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>attrs<span class="token punctuation">[</span><span class="token string">'class'</span><span class="token punctuation">]</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># ['title']</span>

<span class="token comment" spellcheck="true"># 获取内容。可以利用 string 属性获取节点元素包含的文本内容</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>title<span class="token punctuation">.</span>string<span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># The Dormouse's story</span>
</code></pre>
<h3 id="3-2-3-关联选择"><a href="#3-2-3-关联选择" class="headerlink" title="3.2.3 关联选择"></a>3.2.3 关联选择</h3><ol>
<li>子节点和子孙节点</li>
</ol>
<pre class=" language-python"><code class="language-python">html <span class="token operator">=</span> <span class="token triple-quoted-string string">'''&lt;html>
 &lt;head>
  &lt;title>The Dormouse's story&lt;/title>
 &lt;/head>
 &lt;body>
  &lt;p class="story">
   Once upon a time there were three little sisters; and their names were
   &lt;a  href="http://example.com/elsie" class="sister" id="link1">
    &lt;span>Elsie&lt;/span>
   &lt;/a>
   &lt;a href="http://example.com/lacie" class="sister" id="link2">Lacie&lt;/a>
    and
   &lt;a href="http://example.com/tillie" class="sister" id="link3">Tillie&lt;/a>
and they lived at the bottom of a well.
  &lt;/p>
  &lt;p class="story">...&lt;/p>
'''</span>
soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>contents<span class="token punctuation">)</span>

<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>children<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 得到p标签下的子节点</span>
<span class="token keyword">for</span> i<span class="token punctuation">,</span> child <span class="token keyword">in</span> enumerate<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>children<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>i<span class="token punctuation">,</span> child<span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># 如果要得到所有的子孙节点的话，可以调用 descendants 属性</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>descendants<span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;generator object Tag.descendants at 0x00000143C3039EC8></span>
<span class="token keyword">for</span> i<span class="token punctuation">,</span> child <span class="token keyword">in</span> enumerate<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>p<span class="token punctuation">.</span>descendants<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>i<span class="token punctuation">,</span> child<span class="token punctuation">)</span>
</code></pre>
<ol start="2">
<li>父节点和祖先节点</li>
</ol>
<pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
父节点和祖先节点
如果要获取某个节点元素的父节点，可以调用 parent 属性
"""</span>
soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>a<span class="token punctuation">.</span>parent<span class="token punctuation">)</span>

soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>a<span class="token punctuation">.</span>parents<span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;class 'generator'></span>
<span class="token keyword">print</span><span class="token punctuation">(</span>list<span class="token punctuation">(</span>enumerate<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>a<span class="token punctuation">.</span>parents<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
</code></pre>
<ol start="3">
<li>兄弟节点</li>
</ol>
<pre class=" language-python"><code class="language-python"><span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Next Sibling'</span><span class="token punctuation">,</span> soup<span class="token punctuation">.</span>a<span class="token punctuation">.</span>next_sibling<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Prev Sibling'</span><span class="token punctuation">,</span> soup<span class="token punctuation">.</span>a<span class="token punctuation">.</span>previous_sibling<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Next Siblings'</span><span class="token punctuation">,</span> list<span class="token punctuation">(</span>enumerate<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>a<span class="token punctuation">.</span>next_siblings<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Prev Siblings'</span><span class="token punctuation">,</span> list<span class="token punctuation">(</span>enumerate<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>a<span class="token punctuation">.</span>previous_siblings<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># next_sibling 和 previous_sibling 分别获取节点的下一个和上一个兄弟元素，</span>
<span class="token comment" spellcheck="true"># next_siblings 和 previous_siblings 分别返回后面和前面的所有兄弟节点的生成器。</span>
</code></pre>
<h3 id="3-2-4-方法选择器"><a href="#3-2-4-方法选择器" class="headerlink" title="3.2.4 方法选择器"></a>3.2.4 方法选择器</h3><ul>
<li>find_all()</li>
</ul>
<p>查询所有符合条件的元素</p>
<p>API 如下：</p>
<p><code>find_all(name, attrs, recursive, text, **kwargs)</code></p>
<ol>
<li>name。根据节点名来查询元素</li>
</ol>
<pre class=" language-python"><code class="language-python">html <span class="token operator">=</span> <span class="token triple-quoted-string string">'''&lt;div class="panel">
&lt;div class="panel-heading">
&lt;h4>Hello&lt;/h4> 
&lt;/div> 
&lt;div class="panel-body"> 
&lt;li class="list" id="list-1">
&lt;li class="element">Foo&lt;/li> 
&lt;li class="element">Bar&lt;/li> 
&lt;li class="element">Jay&lt;/li> 
&lt;ul>
&lt;li class="element">Fool&lt;/li> 
&lt;/ul> 
&lt;ul class="list list-small" id="list-2">
&lt;li class="element">Foo&lt;/li> 
&lt;li class="element">Bar&lt;/li> 
&lt;/ul> 
&lt;/div> 
&lt;/div>
'''</span>
soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>find<span class="token punctuation">(</span>name<span class="token operator">=</span><span class="token string">'ul'</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;class 'bs4.element.Tag'></span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>find_all<span class="token punctuation">(</span>name<span class="token operator">=</span><span class="token string">'ul'</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;class 'bs4.element.ResultSet'></span>
<span class="token keyword">print</span><span class="token punctuation">(</span>type<span class="token punctuation">(</span>soup<span class="token punctuation">.</span>find_all<span class="token punctuation">(</span>name<span class="token operator">=</span><span class="token string">'ul'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># &lt;class 'bs4.element.Tag'></span>
<span class="token keyword">for</span> ul <span class="token keyword">in</span> soup<span class="token punctuation">.</span>find_all<span class="token punctuation">(</span>name<span class="token operator">=</span><span class="token string">'ul'</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># print(ul.find_all(name='li'))</span>
    <span class="token keyword">for</span> li <span class="token keyword">in</span> ul<span class="token punctuation">.</span>find_all<span class="token punctuation">(</span>name<span class="token operator">=</span><span class="token string">'li'</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span>li<span class="token punctuation">.</span>string<span class="token punctuation">)</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span>li<span class="token punctuation">[</span><span class="token string">'class'</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span>li<span class="token punctuation">)</span>
</code></pre>
<ol start="2">
<li>attrs。根据属性来查询。</li>
</ol>
<pre class=" language-python"><code class="language-python">html <span class="token operator">=</span> <span class="token triple-quoted-string string">'''&lt;div class="panel">
&lt;div class="panel-heading">
&lt;h4>Hello&lt;/h4> 
&lt;/div> 
&lt;div class="panel-body"> 
&lt;ul class="list" id="list-1" name="elements">
&lt;li class="element">Foo&lt;/li> 
&lt;li class="element">Bar&lt;/li> 
&lt;li class="element">Jay&lt;/li> 
&lt;/ul>
&lt;ul class="list list-small" id="list-2">
&lt;li class="element">Foo&lt;/li> 
&lt;li class="element">Bar&lt;/li> 
&lt;/ul> 
&lt;/div> 
&lt;/div>
'''</span>
soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>find_all<span class="token punctuation">(</span>attrs<span class="token operator">=</span><span class="token punctuation">{</span><span class="token string">'id'</span><span class="token punctuation">:</span> <span class="token string">'list-1'</span><span class="token punctuation">}</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># print(soup.find_all(id='list-1'))</span>
<span class="token comment" spellcheck="true"># print(soup.find_all(attrs={'name': 'elements'}))</span>
<span class="token comment" spellcheck="true"># print(type(soup.find_all(attrs={'name': 'elements'})))  # &lt;class 'bs4.element.ResultSet'></span>
</code></pre>
<ol start="3">
<li>text。匹配节点的文本，传入的形式可以是字符串，可以是正则表达式对象。</li>
</ol>
<pre class=" language-python"><code class="language-python"><span class="token triple-quoted-string string">"""
text
"""</span>
html <span class="token operator">=</span> <span class="token triple-quoted-string string">''' 
&lt;div class="panel"> &lt;div class="panel-body">
&lt;a>Hello, this is a link&lt;/a> 
&lt;a>Hello, this is a link, too&lt;/a> 
&lt;/div> 
&lt;/div>
'''</span>
<span class="token triple-quoted-string string">"""这里有两个 a 节点，其内部包含文本信息。这里在 find_all() 方法中传入 text 参数，该参数为
正则表达式对象，结果返回所有匹配正则表达式的节点文本组成的列表。"""</span>
<span class="token keyword">import</span> re
soup <span class="token operator">=</span> BeautifulSoup<span class="token punctuation">(</span>html<span class="token punctuation">,</span> <span class="token string">'lxml'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>soup<span class="token punctuation">.</span>find_all<span class="token punctuation">(</span>text<span class="token operator">=</span>re<span class="token punctuation">.</span>compile<span class="token punctuation">(</span><span class="token string">'link'</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># ['Hello, this is a link', 'Hello, this is a link, too']</span>
</code></pre>
<ul>
<li>find()。返回的是单个元素，也就是第一个匹配的元素。</li>
</ul>
<h1 id="第四章、数据存储"><a href="#第四章、数据存储" class="headerlink" title="第四章、数据存储"></a>第四章、数据存储</h1><p><img src="C:\Users\14533\AppData\Roaming\Typora\typora-user-images\image-20210923104926220.png"></p>
<h2 id="4-1-文件存储"><a href="#4-1-文件存储" class="headerlink" title="4.1 文件存储"></a>4.1 文件存储</h2><h3 id="4-1-1-TXT-文本存储"><a href="#4-1-1-TXT-文本存储" class="headerlink" title="4.1.1 TXT 文本存储"></a>4.1.1 TXT 文本存储</h3><blockquote>
<pre class=" language-python"><code class="language-python">使用Python打开文件，有两种写法。
第<span class="token number">1</span>种方式如下：
f <span class="token operator">=</span> open<span class="token punctuation">(</span><span class="token string">'文件路径'</span><span class="token punctuation">,</span> <span class="token string">'文件操作方式'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
对文件进行操作
f<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>

第<span class="token number">2</span>种方式，使用Python的上下文管理器：
<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'文件路径'</span><span class="token punctuation">,</span> <span class="token string">'文件操作方式'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
  对文件进行操作

第<span class="token number">1</span>种方式需要手动关闭文件，但是在程序开发中经常会出现忘记关闭文件的情况。
第<span class="token number">2</span>种方法不需要手动关闭文件，只要代码退出了缩进，Python就会自动关闭文件。 
</code></pre>
</blockquote>
<pre class=" language-python"><code class="language-python">使用Python写文本文件
    使用Python写文件也需要先打开文件，使用如下代码来打开文件：
<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'new.txt'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
</code></pre>
<pre class=" language-python"><code class="language-python"><span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'../text.txt'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    content_list <span class="token operator">=</span> f<span class="token punctuation">.</span>readlines<span class="token punctuation">(</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># 读取所有行，以列表形式返回结果</span>
    <span class="token comment" spellcheck="true"># content_list = f.read()  # 直接把文件里面的全部内容用一个字符串返回</span>
    <span class="token comment" spellcheck="true"># content_list = f.readline()  # 读取一行数据</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>content_list<span class="token punctuation">)</span>

<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'../text2.txt'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    f<span class="token punctuation">.</span>write<span class="token punctuation">(</span><span class="token string">"hello,world\n"</span><span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># f.write("hello,python")</span>
    f<span class="token punctuation">.</span>writelines<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token string">'\n第一段\n'</span><span class="token punctuation">,</span> <span class="token string">'第二段'</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
</code></pre>
<h3 id="4-1-2-JSON-文件存储"><a href="#4-1-2-JSON-文件存储" class="headerlink" title="4.1.2 JSON 文件存储"></a>4.1.2 JSON 文件存储</h3><pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># 调用 JSON 库的 loads() 方法将 JSON 文本字符串转为 JSON 对象。</span>
<span class="token comment" spellcheck="true"># 通过 dumps() 方法将 JSON 对象转为文本字符串。</span>

<span class="token keyword">import</span> json
data <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">{</span>
    <span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'王伟'</span><span class="token punctuation">,</span>
    <span class="token string">'gender'</span><span class="token punctuation">:</span> <span class="token string">'男'</span><span class="token punctuation">,</span>
    <span class="token string">'birthday'</span><span class="token punctuation">:</span> <span class="token string">'1992-10-18'</span><span class="token punctuation">}</span><span class="token punctuation">]</span>
<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'data3.json'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> file<span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># 这里为了输出中文，要指定参数 ensure_ascii 为 False</span>
    file<span class="token punctuation">.</span>write<span class="token punctuation">(</span>json<span class="token punctuation">.</span>dumps<span class="token punctuation">(</span>data<span class="token punctuation">,</span> indent<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">,</span> ensure_ascii<span class="token operator">=</span><span class="token boolean">False</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</code></pre>
<h3 id="4-1-3-CSV-文件存储"><a href="#4-1-3-CSV-文件存储" class="headerlink" title="4.1.3 CSV 文件存储"></a>4.1.3 CSV 文件存储</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> csv
data <span class="token operator">=</span> <span class="token punctuation">[</span>
    <span class="token punctuation">{</span><span class="token string">'id'</span><span class="token punctuation">:</span> <span class="token number">1001</span><span class="token punctuation">,</span> <span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'Mike1'</span><span class="token punctuation">,</span> <span class="token string">'age'</span><span class="token punctuation">:</span> <span class="token number">10</span><span class="token punctuation">}</span><span class="token punctuation">,</span>
    <span class="token punctuation">{</span><span class="token string">'id'</span><span class="token punctuation">:</span> <span class="token number">1002</span><span class="token punctuation">,</span> <span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'Mike2'</span><span class="token punctuation">,</span> <span class="token string">'age'</span><span class="token punctuation">:</span> <span class="token number">20</span><span class="token punctuation">}</span><span class="token punctuation">,</span>
    <span class="token punctuation">{</span><span class="token string">'id'</span><span class="token punctuation">:</span> <span class="token number">1003</span><span class="token punctuation">,</span> <span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'子'</span><span class="token punctuation">,</span> <span class="token string">'age'</span><span class="token punctuation">:</span> <span class="token number">30</span><span class="token punctuation">}</span><span class="token punctuation">,</span>
    <span class="token punctuation">{</span><span class="token string">'id'</span><span class="token punctuation">:</span> <span class="token number">1003</span><span class="token punctuation">,</span> <span class="token string">'name'</span><span class="token punctuation">:</span> <span class="token string">'Mike3'</span><span class="token punctuation">,</span> <span class="token string">'age'</span><span class="token punctuation">:</span> <span class="token number">30</span><span class="token punctuation">}</span>
<span class="token punctuation">]</span>
<span class="token comment" spellcheck="true"># 指定 newline='' 可避免写入的各行之间出现多余的空行</span>
<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'data3.csv'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> newline<span class="token operator">=</span><span class="token string">''</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    fieldnames <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'id'</span><span class="token punctuation">,</span> <span class="token string">'name'</span><span class="token punctuation">,</span> <span class="token string">'age'</span><span class="token punctuation">]</span>
    writer <span class="token operator">=</span> csv<span class="token punctuation">.</span>DictWriter<span class="token punctuation">(</span>f<span class="token punctuation">,</span> fieldnames<span class="token operator">=</span>fieldnames<span class="token punctuation">)</span>
    writer<span class="token punctuation">.</span>writeheader<span class="token punctuation">(</span><span class="token punctuation">)</span>
    writer<span class="token punctuation">.</span>writerows<span class="token punctuation">(</span>data<span class="token punctuation">)</span>
</code></pre>
<p><img src="https://cdn.jsdelivr.net/gh/Aunean-ls/pic/img/image-20210923152652422.png" alt="image-20210923152652422"></p>
<h2 id="4-2-关系型数据库存储"><a href="#4-2-关系型数据库存储" class="headerlink" title="4.2 关系型数据库存储"></a>4.2 关系型数据库存储</h2><h3 id="4-2-1-MySQL-的存储"><a href="#4-2-1-MySQL-的存储" class="headerlink" title="4.2.1 MySQL 的存储"></a>4.2.1 MySQL 的存储</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">def</span> <span class="token function">insert_data_mysql</span><span class="token punctuation">(</span>data<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># 1.连接数据库</span>
    conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> password<span class="token operator">=</span><span class="token string">'qaz3357375'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span> db<span class="token operator">=</span><span class="token string">'spiders'</span><span class="token punctuation">,</span> charset<span class="token operator">=</span><span class="token string">'utf8'</span><span class="token punctuation">)</span>
    cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>
    
    <span class="token comment" spellcheck="true"># 创建数据库</span>
    <span class="token comment" spellcheck="true"># cursor.execute('CREATE DATABASE if not exists spiders DEFAULT CHARACTER SET utf8')</span>
    <span class="token comment" spellcheck="true"># cursor.execute('use spiders')</span>
    
    <span class="token comment" spellcheck="true"># 2.创建表</span>
    sql <span class="token operator">=</span> <span class="token string">'create table if not exists zhaopin(position varchar(235), '</span> \
          <span class="token string">'salary varchar(235), '</span> \
          <span class="token string">'work_place varchar(235), '</span> \
          <span class="token string">'Number_recruiters varchar(235), '</span> \
          <span class="token string">'operating_duty varchar(235), '</span> \
          <span class="token string">'requirement varchar(235), '</span> \
          <span class="token string">'Release_time varchar(235)'</span> \
          <span class="token string">')charset utf8'</span>

    cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># 3.插入数据</span>
    table <span class="token operator">=</span> <span class="token string">'表名'</span>
    keys <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>data<span class="token punctuation">.</span>keys<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
    values <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token string">'%s'</span><span class="token punctuation">]</span> <span class="token operator">*</span> len<span class="token punctuation">(</span>data<span class="token punctuation">)</span><span class="token punctuation">)</span>
    sql <span class="token operator">=</span> <span class="token string">'insert into {table}({keys}) values({values})'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>table<span class="token operator">=</span>table<span class="token punctuation">,</span> keys<span class="token operator">=</span>keys<span class="token punctuation">,</span> values<span class="token operator">=</span>values<span class="token punctuation">)</span>
    <span class="token keyword">try</span><span class="token punctuation">:</span>
        <span class="token keyword">if</span> cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">,</span> tuple<span class="token punctuation">(</span>data<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Successful'</span><span class="token punctuation">)</span>
            conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>
    <span class="token keyword">except</span><span class="token punctuation">:</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Failed'</span><span class="token punctuation">)</span>
        conn<span class="token punctuation">.</span>rollback<span class="token punctuation">(</span><span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># 关闭连接</span>
    conn<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>




<span class="token comment" spellcheck="true"># 数据导入</span>
conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> password<span class="token operator">=</span><span class="token string">'qaz3357375'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span> db<span class="token operator">=</span><span class="token string">'spiders'</span><span class="token punctuation">,</span> local_infile<span class="token operator">=</span><span class="token number">1</span><span class="token punctuation">)</span>
cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>

sql <span class="token operator">=</span> <span class="token string">'create table if not exists house_data('</span> \
<span class="token string">'title varchar(1235), '</span> \
<span class="token string">'type varchar(1235), '</span> \
<span class="token string">'area varchar(1235), '</span> \
<span class="token string">'price_square_meter '</span> \
<span class="token string">'varchar(1235), '</span> \
<span class="token string">'total_price varchar(1235), '</span> \
<span class="token string">'orientation varchar(1235), '</span> \
<span class="token string">'floor varchar(1235), '</span> \
<span class="token string">'address varchar(1235)'</span> \
<span class="token string">')charset utf8'</span>
cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
<span class="token keyword">try</span><span class="token punctuation">:</span>
    sql <span class="token operator">=</span> <span class="token string">"load data local infile '路径/文件名.csv' into table 表名 fields terminated by ',' lines terminated by '\n' ignore 1 lines"</span>
    cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
    conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'成功'</span><span class="token punctuation">)</span>
<span class="token keyword">except</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'失败'</span><span class="token punctuation">)</span>

conn<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>
</code></pre>
<h2 id="4-3-非关系型数据库存储"><a href="#4-3-非关系型数据库存储" class="headerlink" title="4.3 非关系型数据库存储"></a>4.3 非关系型数据库存储</h2><h3 id="4-3-1-MongoDB-存储"><a href="#4-3-1-MongoDB-存储" class="headerlink" title="4.3.1 MongoDB 存储"></a>4.3.1 MongoDB 存储</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">def</span> <span class="token function">insert_data_mongodb</span><span class="token punctuation">(</span>data_list<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># 1.连接MongoDB</span>
    client <span class="token operator">=</span> pymongo<span class="token punctuation">.</span>MongoClient<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">27017</span><span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true">#  指定数据库</span>
    db <span class="token operator">=</span> client<span class="token punctuation">[</span><span class="token string">'数据库名'</span><span class="token punctuation">]</span>
    <span class="token comment" spellcheck="true"># 指定集合</span>
    collection <span class="token operator">=</span> db<span class="token punctuation">[</span><span class="token string">'集合名'</span><span class="token punctuation">]</span>

    <span class="token keyword">for</span> i <span class="token keyword">in</span> data_list<span class="token punctuation">:</span>
        collection<span class="token punctuation">.</span>insert_one<span class="token punctuation">(</span>i<span class="token punctuation">)</span>
        
<span class="token keyword">def</span> <span class="token function">insert_data_mongo</span><span class="token punctuation">(</span>data<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># 1.连接MongoDB</span>
    client <span class="token operator">=</span> pymongo<span class="token punctuation">.</span>MongoClient<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">27017</span><span class="token punctuation">)</span>

    <span class="token comment" spellcheck="true"># 2.指定数据库</span>
    db <span class="token operator">=</span> client<span class="token punctuation">[</span><span class="token string">'test'</span><span class="token punctuation">]</span>

    <span class="token comment" spellcheck="true"># 3.指定集合</span>
    collection <span class="token operator">=</span> db<span class="token punctuation">[</span><span class="token string">'zhaopin'</span><span class="token punctuation">]</span>

    <span class="token comment" spellcheck="true"># 4.插入多条数据</span>
    collection<span class="token punctuation">.</span>insert_many<span class="token punctuation">(</span>data<span class="token punctuation">)</span>
</code></pre>
<h1 id="第五章、Ajax数据爬取"><a href="#第五章、Ajax数据爬取" class="headerlink" title="第五章、Ajax数据爬取"></a>第五章、Ajax数据爬取</h1><h1 id="第六章、动态渲染页面爬取"><a href="#第六章、动态渲染页面爬取" class="headerlink" title="第六章、动态渲染页面爬取"></a>第六章、动态渲染页面爬取</h1><h2 id="6-1-Selenium-的使用"><a href="#6-1-Selenium-的使用" class="headerlink" title="6.1 Selenium 的使用"></a>6.1 Selenium 的使用</h2><pre class=" language-python"><code class="language-python"><span class="token keyword">from</span> selenium <span class="token keyword">import</span> webdriver
<span class="token keyword">from</span> selenium<span class="token punctuation">.</span>webdriver<span class="token punctuation">.</span>common<span class="token punctuation">.</span>by <span class="token keyword">import</span> By
<span class="token keyword">from</span> selenium<span class="token punctuation">.</span>webdriver<span class="token punctuation">.</span>common<span class="token punctuation">.</span>keys <span class="token keyword">import</span> Keys
<span class="token keyword">from</span> selenium<span class="token punctuation">.</span>webdriver<span class="token punctuation">.</span>support <span class="token keyword">import</span> expected_conditions <span class="token keyword">as</span> EC
<span class="token keyword">from</span> selenium<span class="token punctuation">.</span>webdriver<span class="token punctuation">.</span>support<span class="token punctuation">.</span>wait <span class="token keyword">import</span> WebDriverWait

browser <span class="token operator">=</span> webdriver<span class="token punctuation">.</span>Chrome<span class="token punctuation">(</span><span class="token punctuation">)</span>
<span class="token keyword">try</span><span class="token punctuation">:</span>
    browser<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.baidu.com'</span><span class="token punctuation">)</span>
    input <span class="token operator">=</span> browser<span class="token punctuation">.</span>find_element_by_id<span class="token punctuation">(</span><span class="token string">'kw'</span><span class="token punctuation">)</span>
    input<span class="token punctuation">.</span>send_keys<span class="token punctuation">(</span><span class="token string">'Python'</span><span class="token punctuation">)</span>
    input<span class="token punctuation">.</span>send_keys<span class="token punctuation">(</span>Keys<span class="token punctuation">.</span>ENTER<span class="token punctuation">)</span>
    wait <span class="token operator">=</span> WebDriverWait<span class="token punctuation">(</span>browser<span class="token punctuation">,</span> <span class="token number">100</span><span class="token punctuation">)</span>
    wait<span class="token punctuation">.</span>until<span class="token punctuation">(</span>EC<span class="token punctuation">.</span>presence_of_element_located<span class="token punctuation">(</span><span class="token punctuation">(</span>By<span class="token punctuation">.</span>ID<span class="token punctuation">,</span> <span class="token string">'content_left'</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>browser<span class="token punctuation">.</span>current_url<span class="token punctuation">)</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>browser<span class="token punctuation">.</span>get_cookies<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span>browser<span class="token punctuation">.</span>page_source<span class="token punctuation">)</span>
<span class="token keyword">except</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'Error'</span><span class="token punctuation">)</span>
<span class="token keyword">finally</span><span class="token punctuation">:</span>
    browser<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li>访问页面</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">from</span> selenium <span class="token keyword">import</span> webdriver
browser <span class="token operator">=</span> webdriver<span class="token punctuation">.</span>Chrome<span class="token punctuation">(</span><span class="token punctuation">)</span>
browser<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.taobao.com'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>browser<span class="token punctuation">.</span>page_source<span class="token punctuation">)</span>
browser<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li>查找节点</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># 单节点</span>
<span class="token keyword">from</span> selenium <span class="token keyword">import</span> webdriver
browser <span class="token operator">=</span> webdriver<span class="token punctuation">.</span>Chrome<span class="token punctuation">(</span><span class="token punctuation">)</span>
browser<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.taobao.com'</span><span class="token punctuation">)</span>
input_first <span class="token operator">=</span> browser<span class="token punctuation">.</span>find_element_by_id<span class="token punctuation">(</span><span class="token string">'q'</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true"># 根据 id 值查找</span>
input_second <span class="token operator">=</span> browser<span class="token punctuation">.</span>find_element_by_css_selector<span class="token punctuation">(</span><span class="token string">'#q'</span><span class="token punctuation">)</span>
input_third <span class="token operator">=</span> browser<span class="token punctuation">.</span>find_element_by_xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="q"]'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>input_first<span class="token punctuation">,</span> input_second<span class="token punctuation">,</span> input_third<span class="token punctuation">)</span>
browser<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>

<span class="token comment" spellcheck="true"># 多节点</span>
<span class="token keyword">from</span> selenium <span class="token keyword">import</span> webdriver
browser <span class="token operator">=</span> webdriver<span class="token punctuation">.</span>Chrome<span class="token punctuation">(</span><span class="token punctuation">)</span>
browser<span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token string">'https://www.taobao.com'</span><span class="token punctuation">)</span>
lis <span class="token operator">=</span> browser<span class="token punctuation">.</span>find_elements_by_xpath<span class="token punctuation">(</span><span class="token string">'/html/body/div[4]/div[1]/div/div[1]/div/ul/li'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>lis<span class="token punctuation">)</span>
browser<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>
</code></pre>
<ul>
<li>获取属性</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># 获取属性</span>
<span class="token keyword">import</span> requests
<span class="token keyword">from</span> selenium <span class="token keyword">import</span> webdriver
<span class="token keyword">from</span> selenium<span class="token punctuation">.</span>webdriver <span class="token keyword">import</span> ActionChains
browser <span class="token operator">=</span> webdriver<span class="token punctuation">.</span>Chrome<span class="token punctuation">(</span><span class="token punctuation">)</span>
url <span class="token operator">=</span> <span class="token string">'https://www.zhihu.com/explore'</span>
browser<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token punctuation">)</span>
logo <span class="token operator">=</span> browser<span class="token punctuation">.</span>find_element_by_xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="special"]/div[2]/div/div[2]/a/img'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>logo<span class="token punctuation">)</span>
src <span class="token operator">=</span> logo<span class="token punctuation">.</span>get_attribute<span class="token punctuation">(</span><span class="token string">'src'</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># print(src)</span>
tp <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>src<span class="token punctuation">)</span><span class="token punctuation">.</span>content
<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'图片.png'</span><span class="token punctuation">,</span> <span class="token string">'wb'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>tp<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># browser.close()</span>
</code></pre>
<ul>
<li>获取文本</li>
</ul>
<pre class=" language-python"><code class="language-python"><span class="token keyword">from</span> selenium <span class="token keyword">import</span> webdriver
browser <span class="token operator">=</span> webdriver<span class="token punctuation">.</span>Chrome<span class="token punctuation">(</span><span class="token punctuation">)</span>
url <span class="token operator">=</span> <span class="token string">'https://www.zhihu.com/explore'</span>
browser<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token punctuation">)</span>
input <span class="token operator">=</span> browser<span class="token punctuation">.</span>find_element_by_xpath<span class="token punctuation">(</span><span class="token string">'/html/body/div[1]/div/main/div[2]/div[3]/div[2]/div/div[3]/div[2]/div[1]/a'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>input<span class="token punctuation">.</span>text<span class="token punctuation">)</span>
</code></pre>
<h1 id="案例"><a href="#案例" class="headerlink" title="案例"></a>案例</h1><h2 id="案例-1"><a href="#案例-1" class="headerlink" title="案例"></a>案例</h2><h3 id="1-使用框架爬取猫眼top100的信息"><a href="#1-使用框架爬取猫眼top100的信息" class="headerlink" title="1. 使用框架爬取猫眼top100的信息"></a>1. 使用框架爬取猫眼top100的信息</h3><pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># maoyan.py 这里是文件名</span>
<span class="token comment" spellcheck="true"># -*- coding: utf-8 -*-</span>
<span class="token keyword">import</span> scrapy
<span class="token keyword">from</span> maoyanPro<span class="token punctuation">.</span>items <span class="token keyword">import</span> MaoyanproItem


<span class="token keyword">class</span> <span class="token class-name">MaoyanSpider</span><span class="token punctuation">(</span>scrapy<span class="token punctuation">.</span>Spider<span class="token punctuation">)</span><span class="token punctuation">:</span>
    name <span class="token operator">=</span> <span class="token string">'maoyan'</span>
    <span class="token comment" spellcheck="true"># allowed_domains = ['https://maoyan.com/board/4?offset=0']</span>
    start_urls <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'https://maoyan.com/board/4?offset=0'</span><span class="token punctuation">]</span>

    <span class="token keyword">def</span> <span class="token function">parse</span><span class="token punctuation">(</span>self<span class="token punctuation">,</span> response<span class="token punctuation">)</span><span class="token punctuation">:</span>
        dd_list <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="app"]/div/div/div[1]/dl/dd'</span><span class="token punctuation">)</span>
        <span class="token comment" spellcheck="true"># print(dd_list)</span>
        <span class="token keyword">for</span> dd <span class="token keyword">in</span> dd_list<span class="token punctuation">:</span>
            item <span class="token operator">=</span> MaoyanproItem<span class="token punctuation">(</span><span class="token punctuation">)</span>
            item<span class="token punctuation">[</span><span class="token string">"title"</span><span class="token punctuation">]</span> <span class="token operator">=</span> dd<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'./a/@title'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span>
            item<span class="token punctuation">[</span><span class="token string">"actor"</span><span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token punctuation">(</span><span class="token string">''</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>dd<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'./div//p[@class="star"]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">3</span><span class="token punctuation">:</span><span class="token punctuation">]</span>
            item<span class="token punctuation">[</span><span class="token string">"time"</span><span class="token punctuation">]</span> <span class="token operator">=</span> dd<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'./div//p[@class="releasetime"]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">5</span><span class="token punctuation">:</span><span class="token punctuation">]</span>
            item<span class="token punctuation">[</span><span class="token string">"score"</span><span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token string">''</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>dd<span class="token punctuation">.</span>css<span class="token punctuation">(</span><span class="token string">'p.score i::text'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
            <span class="token keyword">yield</span> item

        next_url <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="app"]//div[2]/ul/li[last()]/a/@href'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">if</span> next_url <span class="token operator">!=</span> <span class="token string">'javascript:void(0);'</span><span class="token punctuation">:</span>
            next_url <span class="token operator">=</span> <span class="token string">'https://maoyan.com/board/4'</span> <span class="token operator">+</span> next_url

            <span class="token keyword">yield</span> scrapy<span class="token punctuation">.</span>Request<span class="token punctuation">(</span>next_url<span class="token punctuation">,</span> callback<span class="token operator">=</span>self<span class="token punctuation">.</span>parse<span class="token punctuation">)</span>

            
            
         
        
<span class="token comment" spellcheck="true"># items.py 这里是文件名            </span>
<span class="token comment" spellcheck="true"># -*- coding: utf-8 -*-</span>

<span class="token comment" spellcheck="true"># Define here the models for your scraped items</span>
<span class="token comment" spellcheck="true">#</span>
<span class="token comment" spellcheck="true"># See documentation in:</span>
<span class="token comment" spellcheck="true"># https://docs.scrapy.org/en/latest/topics/items.html</span>

<span class="token keyword">import</span> scrapy


<span class="token keyword">class</span> <span class="token class-name">MaoyanproItem</span><span class="token punctuation">(</span>scrapy<span class="token punctuation">.</span>Item<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># define the fields for your item here like:</span>
    <span class="token comment" spellcheck="true"># name = scrapy.Field()</span>
    title <span class="token operator">=</span> scrapy<span class="token punctuation">.</span>Field<span class="token punctuation">(</span><span class="token punctuation">)</span>
    actor <span class="token operator">=</span> scrapy<span class="token punctuation">.</span>Field<span class="token punctuation">(</span><span class="token punctuation">)</span>
    time <span class="token operator">=</span> scrapy<span class="token punctuation">.</span>Field<span class="token punctuation">(</span><span class="token punctuation">)</span>
    score <span class="token operator">=</span> scrapy<span class="token punctuation">.</span>Field<span class="token punctuation">(</span><span class="token punctuation">)</span>
 




<span class="token comment" spellcheck="true"># pipelines.py 这里是文件名</span>
<span class="token comment" spellcheck="true"># -*- coding: utf-8 -*-</span>

<span class="token comment" spellcheck="true"># Define your item pipelines here</span>
<span class="token comment" spellcheck="true">#</span>
<span class="token comment" spellcheck="true"># Don't forget to add your pipeline to the ITEM_PIPELINES setting</span>
<span class="token comment" spellcheck="true"># See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html</span>
<span class="token keyword">import</span> pymongo
<span class="token keyword">from</span> maoyanPro<span class="token punctuation">.</span>items <span class="token keyword">import</span> MaoyanproItem

<span class="token comment" spellcheck="true"># client = pymongo.MongoClient(host='localhost', port=27017)</span>
<span class="token comment" spellcheck="true"># db = client['test']</span>
<span class="token comment" spellcheck="true"># collection = db['maoyan_2']</span>


<span class="token comment" spellcheck="true"># class MaoyanproPipeline(object):</span>
<span class="token comment" spellcheck="true">#     def process_item(self, item, spider):</span>
<span class="token comment" spellcheck="true">#         print(item)</span>
<span class="token comment" spellcheck="true">#         collection.insert_one(dict(item))</span>
<span class="token comment" spellcheck="true">#         return item</span>
<span class="token comment" spellcheck="true"># import pymysql</span>
<span class="token comment" spellcheck="true"># from maoyan import settings</span>
<span class="token keyword">import</span> pymysql

<span class="token keyword">class</span> <span class="token class-name">MaoyanPipeline</span><span class="token punctuation">(</span>object<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">def</span> <span class="token function">process_item</span><span class="token punctuation">(</span>self<span class="token punctuation">,</span> item<span class="token punctuation">,</span> spider<span class="token punctuation">)</span><span class="token punctuation">:</span>
        title <span class="token operator">=</span> item<span class="token punctuation">[</span><span class="token string">'title'</span><span class="token punctuation">]</span>
        actor <span class="token operator">=</span> item<span class="token punctuation">[</span><span class="token string">'actor'</span><span class="token punctuation">]</span>
        time <span class="token operator">=</span> item<span class="token punctuation">[</span><span class="token string">'time'</span><span class="token punctuation">]</span>
        score <span class="token operator">=</span> item<span class="token punctuation">[</span><span class="token string">'score'</span><span class="token punctuation">]</span>
        <span class="token comment" spellcheck="true"># ********** Begin **********#</span>
        <span class="token comment" spellcheck="true"># 1.连接数据库</span>
        conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>
            host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># 连接的是本地数据库</span>
            port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># 数据库端口号</span>
            user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># 自己的mysql用户名</span>
            passwd<span class="token operator">=</span><span class="token string">'你的密码'</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># 自己的密码</span>
            db<span class="token operator">=</span><span class="token string">'spiders'</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># 数据库的名字</span>
            charset<span class="token operator">=</span><span class="token string">'utf8'</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># 默认的编码方式</span>
        <span class="token punctuation">)</span>
        cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">try</span><span class="token punctuation">:</span>
            sql <span class="token operator">=</span> <span class="token string">'create table if not exists mymovies(title varchar(235), actor varchar(235), time varchar(235), score varchar(235))charset utf8'</span>
            cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
            <span class="token comment" spellcheck="true"># sql = 'insert into mymovies values (\'%s\',\'%s\',\'%s\',\'%s\')' % (name, starts, releasetime, score)</span>
            sql <span class="token operator">=</span> <span class="token string">'insert into mymovies values ("%s","%s","%s","%s")'</span> <span class="token operator">%</span> <span class="token punctuation">(</span>title<span class="token punctuation">,</span> actor<span class="token punctuation">,</span> time<span class="token punctuation">,</span> score<span class="token punctuation">)</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span>title<span class="token operator">+</span><span class="token string">"插入成功"</span><span class="token punctuation">)</span>
            cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
            conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">except</span> Exception <span class="token keyword">as</span> e<span class="token punctuation">:</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span>f<span class="token string">'错误：{e}'</span><span class="token punctuation">)</span>
        <span class="token keyword">finally</span><span class="token punctuation">:</span>
            conn<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">return</span> item
</code></pre>
<h3 id="2-爬取豆瓣电影数据"><a href="#2-爬取豆瓣电影数据" class="headerlink" title="2. 爬取豆瓣电影数据"></a>2. 爬取豆瓣电影数据</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> csv
<span class="token keyword">import</span> requests
<span class="token keyword">import</span> pymysql
<span class="token keyword">import</span> os
<span class="token triple-quoted-string string">'''
需求分析：
此网站为ajax加载
1.爬豆瓣电影数据的名称，评分，图片，视频url。
2.写入到csv和MySQL数据库
'''</span>
<span class="token string">'https://www.douban.com'</span>
<span class="token string">'https://movie.douban.com/'</span>
<span class="token string">'https://movie.douban.com/explore#!type=movie&amp;tag=%E6%9C%80%E6%96%B0&amp;page_limit=20&amp;page_start=0'</span>
<span class="token string">'https://movie.douban.com/j/search_subjects?type=movie&amp;tag=%E6%9C%80%E6%96%B0&amp;page_limit=20&amp;page_start=0'</span>
<span class="token string">'https://movie.douban.com/explore#!type=movie&amp;tag=%E6%9C%80%E6%96%B0&amp;page_limit=20&amp;page_start=20'</span>

headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'</span>

<span class="token punctuation">}</span>


<span class="token keyword">def</span> <span class="token function">response_url</span><span class="token punctuation">(</span>url<span class="token punctuation">)</span><span class="token punctuation">:</span>
    data_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
    <span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span><span class="token number">0</span><span class="token punctuation">,</span> <span class="token number">100</span><span class="token punctuation">,</span> <span class="token number">20</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
        params <span class="token operator">=</span> <span class="token punctuation">{</span>
            <span class="token string">'type'</span><span class="token punctuation">:</span> <span class="token string">'movie'</span><span class="token punctuation">,</span>
            <span class="token string">'tag'</span><span class="token punctuation">:</span> <span class="token string">'最新'</span><span class="token punctuation">,</span>
            <span class="token string">'page_limit'</span><span class="token punctuation">:</span> <span class="token string">'20'</span><span class="token punctuation">,</span>
            <span class="token string">'page_start'</span><span class="token punctuation">:</span> i
        <span class="token punctuation">}</span>
        response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>url<span class="token punctuation">,</span> params<span class="token operator">=</span>params<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>json<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">for</span> movie <span class="token keyword">in</span> response<span class="token punctuation">[</span><span class="token string">'subjects'</span><span class="token punctuation">]</span><span class="token punctuation">:</span>
            title <span class="token operator">=</span> movie<span class="token punctuation">[</span><span class="token string">'title'</span><span class="token punctuation">]</span>
            score <span class="token operator">=</span> movie<span class="token punctuation">[</span><span class="token string">'rate'</span><span class="token punctuation">]</span>
            video_url <span class="token operator">=</span> movie<span class="token punctuation">[</span><span class="token string">'url'</span><span class="token punctuation">]</span>
            img <span class="token operator">=</span> movie<span class="token punctuation">[</span><span class="token string">'cover'</span><span class="token punctuation">]</span>

            data <span class="token operator">=</span> <span class="token punctuation">{</span>
                <span class="token string">'title'</span><span class="token punctuation">:</span> title<span class="token punctuation">,</span>
                <span class="token string">'score'</span><span class="token punctuation">:</span> score<span class="token punctuation">,</span>
                <span class="token string">'video_url'</span><span class="token punctuation">:</span> video_url<span class="token punctuation">,</span>
                <span class="token string">'img'</span><span class="token punctuation">:</span> img
            <span class="token punctuation">}</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span>data<span class="token punctuation">)</span>
            data_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>data<span class="token punctuation">)</span>

            <span class="token comment" spellcheck="true"># 存储图片</span>
            path <span class="token operator">=</span> <span class="token string">'douban_img/'</span>
            <span class="token keyword">if</span> <span class="token operator">not</span> os<span class="token punctuation">.</span>path<span class="token punctuation">.</span>exists<span class="token punctuation">(</span>path<span class="token punctuation">)</span><span class="token punctuation">:</span>
                os<span class="token punctuation">.</span>makedirs<span class="token punctuation">(</span>path<span class="token punctuation">)</span>
            img_data <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>img<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content
            img_name <span class="token operator">=</span> img<span class="token punctuation">.</span>split<span class="token punctuation">(</span><span class="token string">'/'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">]</span>
            <span class="token keyword">with</span> open<span class="token punctuation">(</span>path<span class="token operator">+</span>img_name<span class="token punctuation">,</span> <span class="token string">'wb'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
                f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>img_data<span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># insert_csv(data_list)</span>


<span class="token keyword">def</span> <span class="token function">insert_csv</span><span class="token punctuation">(</span>data_list<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'./豆瓣.csv'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">,</span> newline<span class="token operator">=</span><span class="token string">''</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
        fieldnames <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'title'</span><span class="token punctuation">,</span> <span class="token string">'score'</span><span class="token punctuation">,</span> <span class="token string">'video_url'</span><span class="token punctuation">,</span> <span class="token string">'img'</span><span class="token punctuation">]</span>
        writer <span class="token operator">=</span> csv<span class="token punctuation">.</span>DictWriter<span class="token punctuation">(</span>f<span class="token punctuation">,</span> fieldnames<span class="token operator">=</span>fieldnames<span class="token punctuation">)</span>
        writer<span class="token punctuation">.</span>writeheader<span class="token punctuation">(</span><span class="token punctuation">)</span>
        writer<span class="token punctuation">.</span>writerows<span class="token punctuation">(</span>data_list<span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># insert_mysql()</span>


<span class="token keyword">def</span> <span class="token function">insert_mysql</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span> user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> password<span class="token operator">=</span><span class="token string">'qaz3357375'</span><span class="token punctuation">,</span> db<span class="token operator">=</span><span class="token string">'spiders2'</span><span class="token punctuation">,</span> local_infile<span class="token operator">=</span><span class="token number">1</span><span class="token punctuation">)</span>
    cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>
    sql <span class="token operator">=</span> <span class="token string">'create table if not exists douban(title varchar(235), score varchar(235), video_url varchar(235), img varchar(235))'</span>
    cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
    <span class="token keyword">try</span><span class="token punctuation">:</span>
        sql <span class="token operator">=</span> <span class="token string">"load data local infile 'D:/learning/spider/project/爬虫案例/豆瓣.csv' into table douban fields terminated by ',' lines terminated by '\n' ignore 1 lines"</span>
        cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
        conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>
    <span class="token keyword">except</span> Exception <span class="token keyword">as</span> e<span class="token punctuation">:</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span>f<span class="token string">'错误：{e}'</span><span class="token punctuation">)</span>
    <span class="token keyword">finally</span><span class="token punctuation">:</span>
        conn<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>


<span class="token keyword">if</span> __name__ <span class="token operator">==</span> <span class="token string">'__main__'</span><span class="token punctuation">:</span>
    url <span class="token operator">=</span> <span class="token string">'https://movie.douban.com/j/search_subjects?'</span>
    response_url<span class="token punctuation">(</span>url<span class="token punctuation">)</span>
</code></pre>
<h3 id="3-爬取小说"><a href="#3-爬取小说" class="headerlink" title="3. 爬取小说"></a>3. 爬取小说</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> os
<span class="token keyword">import</span> parsel
<span class="token keyword">import</span> requests

<span class="token string">'https://www.xsbiquge.com'</span>
<span class="token string">'https://www.xsbiquge.com/20_20331/1135932.html'</span>
<span class="token string">'https://www.xsbiquge.com/20_20331/1135933.html'</span>
headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'Host'</span><span class="token punctuation">:</span> <span class="token string">'www.xsbiquge.com'</span><span class="token punctuation">,</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'</span><span class="token punctuation">,</span>
    <span class="token string">'Accept'</span><span class="token punctuation">:</span> <span class="token string">'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'</span><span class="token punctuation">,</span>
    <span class="token string">'Accept-Language'</span><span class="token punctuation">:</span> <span class="token string">'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'</span><span class="token punctuation">,</span>
    <span class="token string">'Accept-Encoding'</span><span class="token punctuation">:</span> <span class="token string">'gzip, deflate, br'</span><span class="token punctuation">,</span>
    <span class="token string">'Referer'</span><span class="token punctuation">:</span> <span class="token string">'https://www.baidu.com/link?url=gREzciPAbw8dLQOU3UGpX-fvNBEUVHtO7HU5hwdzIgWZIbq9jRhRSkYosvPBCdRP&amp;wd=&amp;eqid=cf2722fb00007d98000000045fb2203b'</span><span class="token punctuation">,</span>
    <span class="token string">'Connection'</span><span class="token punctuation">:</span> <span class="token string">'keep-alive'</span><span class="token punctuation">,</span>
    <span class="token string">'Cookie'</span><span class="token punctuation">:</span> <span class="token string">'PPad_id_PP=3; bcolor=; font=; size=; fontcolor=; width='</span><span class="token punctuation">,</span>
    <span class="token string">'Upgrade-Insecure-Requests'</span><span class="token punctuation">:</span> <span class="token string">'1'</span><span class="token punctuation">,</span>
    <span class="token string">'If-Modified-Since'</span><span class="token punctuation">:</span> <span class="token string">'Sun, 15 Nov 2020 16:52:47 GMT'</span><span class="token punctuation">,</span>
    <span class="token string">'If-None-Match'</span><span class="token punctuation">:</span> <span class="token string">'W/"5fb15cdf-353e1"'</span><span class="token punctuation">,</span>
    <span class="token string">'Cache-Control'</span><span class="token punctuation">:</span> <span class="token string">'max-age=0'</span>
<span class="token punctuation">}</span>


<span class="token keyword">def</span> <span class="token function">urls_info</span><span class="token punctuation">(</span>url<span class="token punctuation">)</span><span class="token punctuation">:</span>
    response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># print(response)</span>
    sel <span class="token operator">=</span> parsel<span class="token punctuation">.</span>Selector<span class="token punctuation">(</span>response<span class="token punctuation">)</span>
    htmls <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
    use_info <span class="token operator">=</span> sel<span class="token punctuation">.</span>css<span class="token punctuation">(</span><span class="token string">'dl dd a::attr(href)'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>getall<span class="token punctuation">(</span><span class="token punctuation">)</span>
    <span class="token keyword">for</span> i <span class="token keyword">in</span> use_info<span class="token punctuation">:</span>
        htmls<span class="token punctuation">.</span>append<span class="token punctuation">(</span>url <span class="token operator">+</span> <span class="token punctuation">(</span>i<span class="token punctuation">[</span><span class="token number">10</span><span class="token punctuation">:</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># print(htmls)</span>
    page_content<span class="token punctuation">(</span>htmls<span class="token punctuation">)</span>


<span class="token string">'&lt;dl>'</span>
<span class="token string">'&lt;dd>&lt;a href="/20_20331/1135932.html">第1章 八百年后&lt;/a>&lt;/dd>'</span>
<span class="token string">'&lt;/dl>'</span>


<span class="token keyword">def</span> <span class="token function">page_content</span><span class="token punctuation">(</span>htmls<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">for</span> html <span class="token keyword">in</span> htmls<span class="token punctuation">:</span>
        response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>html<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
        path <span class="token operator">=</span> <span class="token string">'02_万古神帝/'</span>
        <span class="token keyword">if</span> <span class="token operator">not</span> os<span class="token punctuation">.</span>path<span class="token punctuation">.</span>exists<span class="token punctuation">(</span>path<span class="token punctuation">)</span><span class="token punctuation">:</span>
            os<span class="token punctuation">.</span>makedirs<span class="token punctuation">(</span>path<span class="token punctuation">)</span>

        sel <span class="token operator">=</span> parsel<span class="token punctuation">.</span>Selector<span class="token punctuation">(</span>response<span class="token punctuation">)</span>
        title <span class="token operator">=</span> sel<span class="token punctuation">.</span>css<span class="token punctuation">(</span><span class="token string">'h1::text'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>get<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span>title<span class="token punctuation">)</span>
        contents <span class="token operator">=</span> sel<span class="token punctuation">.</span>css<span class="token punctuation">(</span><span class="token string">'div#content::text'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>getall<span class="token punctuation">(</span><span class="token punctuation">)</span>

        <span class="token keyword">for</span> content <span class="token keyword">in</span> contents<span class="token punctuation">:</span>
            content <span class="token operator">=</span> str<span class="token punctuation">(</span>content<span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'&amp;#039;'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span>
            <span class="token keyword">with</span> open<span class="token punctuation">(</span>path<span class="token operator">+</span>str<span class="token punctuation">(</span>title<span class="token punctuation">)</span><span class="token operator">+</span><span class="token string">'.txt'</span><span class="token punctuation">,</span> <span class="token string">'a'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
                f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>content <span class="token operator">+</span> <span class="token string">'\n'</span><span class="token punctuation">)</span>


<span class="token string">'html body div#wrapper div.content_read div.box_con div#content'</span>
<span class="token keyword">if</span> __name__ <span class="token operator">==</span> <span class="token string">'__main__'</span><span class="token punctuation">:</span>
    url <span class="token operator">=</span> <span class="token string">'https://www.xsbiquge.com/20_20331/'</span>
    urls_info<span class="token punctuation">(</span>url<span class="token punctuation">)</span>
</code></pre>
<h3 id="4-使用scrapy框架爬取小说"><a href="#4-使用scrapy框架爬取小说" class="headerlink" title="4. 使用 Scrapy 框架爬取小说"></a>4. 使用 Scrapy 框架爬取小说</h3><pre class=" language-python"><code class="language-python"><span class="token comment" spellcheck="true"># -*- coding: utf-8 -*-</span>
<span class="token keyword">import</span> scrapy
<span class="token keyword">from</span> wanguPro<span class="token punctuation">.</span>items <span class="token keyword">import</span> WanguproItem

<span class="token keyword">class</span> <span class="token class-name">WanguSpider</span><span class="token punctuation">(</span>scrapy<span class="token punctuation">.</span>Spider<span class="token punctuation">)</span><span class="token punctuation">:</span>
    name <span class="token operator">=</span> <span class="token string">'wangu'</span>
    <span class="token comment" spellcheck="true"># allowed_domains = ['www.xsbiquge.com/20_20331/']</span>
    start_urls <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'http://www.xsbiquge.com/20_20331//'</span><span class="token punctuation">]</span>

    <span class="token keyword">def</span> <span class="token function">parse</span><span class="token punctuation">(</span>self<span class="token punctuation">,</span> response<span class="token punctuation">)</span><span class="token punctuation">:</span>
        detail_urls <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@id="list"]/dl/dd/a/@href'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">for</span> detail_url <span class="token keyword">in</span> detail_urls<span class="token punctuation">:</span>
            detail_url <span class="token operator">=</span> <span class="token string">'https://www.xsbiquge.com'</span> <span class="token operator">+</span> detail_url
            <span class="token comment" spellcheck="true"># print(detail_url)</span>

            <span class="token keyword">yield</span> scrapy<span class="token punctuation">.</span>Request<span class="token punctuation">(</span>detail_url<span class="token punctuation">,</span> callback<span class="token operator">=</span>self<span class="token punctuation">.</span>detail_parse<span class="token punctuation">)</span>

    <span class="token keyword">def</span> <span class="token function">detail_parse</span><span class="token punctuation">(</span>self<span class="token punctuation">,</span> response<span class="token punctuation">)</span><span class="token punctuation">:</span>
        title <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//h1/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span>
        content <span class="token operator">=</span> <span class="token string">''</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@id="content"]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\xa0\xa0\xa0\xa0'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'&amp;#039;'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span>
        <span class="token comment" spellcheck="true"># print(title)</span>
        item <span class="token operator">=</span> WanguproItem<span class="token punctuation">(</span><span class="token punctuation">)</span>
        item<span class="token punctuation">[</span><span class="token string">'title'</span><span class="token punctuation">]</span> <span class="token operator">=</span> title
        item<span class="token punctuation">[</span><span class="token string">'content'</span><span class="token punctuation">]</span> <span class="token operator">=</span> content
        <span class="token keyword">yield</span> item

 



<span class="token comment" spellcheck="true"># -*- coding: utf-8 -*-</span>

<span class="token comment" spellcheck="true"># Define your item pipelines here</span>
<span class="token comment" spellcheck="true">#</span>
<span class="token comment" spellcheck="true"># Don't forget to add your pipeline to the ITEM_PIPELINES setting</span>
<span class="token comment" spellcheck="true"># See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html</span>
<span class="token keyword">import</span> os

<span class="token keyword">class</span> <span class="token class-name">WanguproPipeline</span><span class="token punctuation">(</span>object<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">def</span> <span class="token function">process_item</span><span class="token punctuation">(</span>self<span class="token punctuation">,</span> item<span class="token punctuation">,</span> spider<span class="token punctuation">)</span><span class="token punctuation">:</span>
        path <span class="token operator">=</span> <span class="token string">'万古神帝/'</span>
        <span class="token keyword">if</span> <span class="token operator">not</span> os<span class="token punctuation">.</span>path<span class="token punctuation">.</span>exists<span class="token punctuation">(</span>path<span class="token punctuation">)</span><span class="token punctuation">:</span>
            os<span class="token punctuation">.</span>mkdir<span class="token punctuation">(</span>path<span class="token punctuation">)</span>
        title <span class="token operator">=</span> item<span class="token punctuation">[</span><span class="token string">'title'</span><span class="token punctuation">]</span>
        title <span class="token operator">=</span> title<span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'?'</span><span class="token punctuation">,</span> <span class="token string">'？'</span><span class="token punctuation">)</span>
        content <span class="token operator">=</span> item<span class="token punctuation">[</span><span class="token string">'content'</span><span class="token punctuation">]</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span>f<span class="token string">'{title} 写入成功'</span><span class="token punctuation">)</span>
        <span class="token keyword">with</span> open<span class="token punctuation">(</span>path<span class="token operator">+</span>title<span class="token operator">+</span><span class="token string">'.txt'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
            f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>content<span class="token punctuation">)</span>
        <span class="token keyword">return</span> item
</code></pre>
<h3 id="5-爬取前程无忧招聘数据-正则"><a href="#5-爬取前程无忧招聘数据-正则" class="headerlink" title="5. 爬取前程无忧招聘数据（正则）"></a>5. 爬取前程无忧招聘数据（正则）</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> re
<span class="token keyword">import</span> requests
<span class="token keyword">import</span> csv
<span class="token string">'https://search.51job.com'</span>
<span class="token string">'https://search.51job.com/list/190000,000000,0000,00,9,99,Java,2,1.html?lang=c&amp;postchannel=0000&amp;workyear=99&amp;cotype=99&amp;degreefrom=99&amp;jobterm=99&amp;companysize=99&amp;ord_field=0&amp;dibiaoid=0&amp;line=&amp;welfare='</span>
<span class="token string">'https://search.51job.com/list/190000,000000,0000,00,9,99,python,2,1.html?lang=c&amp;postchannel=0000&amp;workyear=99&amp;cotype=99&amp;degreefrom=99&amp;jobterm=99&amp;companysize=99&amp;ord_field=0&amp;dibiaoid=0&amp;line=&amp;welfare='</span>
<span class="token string">'https://search.51job.com/list/190000,000000,0000,00,9,99,python,2,2.html?lang=c&amp;postchannel=0000&amp;workyear=99&amp;cotype=99&amp;degreefrom=99&amp;jobterm=99&amp;companysize=99&amp;ord_field=0&amp;dibiaoid=0&amp;line=&amp;welfare='</span>
<span class="token string">'https://search.51job.com/list/190000,000000,0000,00,9,99,python,2,1.html?lang=c&amp;postchannel=0000&amp;workyear=99&amp;cotype=99&amp;degreefrom=99&amp;jobterm=99&amp;companysize=99&amp;ord_field=0&amp;dibiaoid=0&amp;line=&amp;welfare='</span>
<span class="token triple-quoted-string string">'''
lang: c
postchannel: 0000
workyear: 99
cotype: 99
degreefrom: 99
jobterm: 99
companysize: 99
ord_field: 0
dibiaoid: 0
line: 
welfare: 
'''</span>
word <span class="token operator">=</span> input<span class="token punctuation">(</span><span class="token string">'请输入搜索关键字：'</span><span class="token punctuation">)</span>
base_url <span class="token operator">=</span> <span class="token string">'https://search.51job.com/list/190000,000000,0000,00,9,99,{},2,{}.html?'</span>

headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'</span>
<span class="token punctuation">}</span>
params <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'lang'</span><span class="token punctuation">:</span> <span class="token string">'c'</span><span class="token punctuation">,</span>
    <span class="token string">'postchannel'</span><span class="token punctuation">:</span> <span class="token string">'0000'</span><span class="token punctuation">,</span>
    <span class="token string">'workyear'</span><span class="token punctuation">:</span> <span class="token string">'99'</span><span class="token punctuation">,</span>
    <span class="token string">'cotype'</span><span class="token punctuation">:</span> <span class="token string">'99'</span><span class="token punctuation">,</span>
    <span class="token string">'degreefrom'</span><span class="token punctuation">:</span> <span class="token string">'99'</span><span class="token punctuation">,</span>
    <span class="token string">'jobterm'</span><span class="token punctuation">:</span> <span class="token string">'99'</span><span class="token punctuation">,</span>
    <span class="token string">'companysize'</span><span class="token punctuation">:</span> <span class="token string">'99'</span><span class="token punctuation">,</span>
    <span class="token string">'ord_field'</span><span class="token punctuation">:</span> <span class="token string">'0'</span><span class="token punctuation">,</span>
    <span class="token string">'dibiaoid'</span><span class="token punctuation">:</span> <span class="token string">'0'</span><span class="token punctuation">,</span>
    <span class="token string">'line'</span><span class="token punctuation">:</span> <span class="token string">''</span><span class="token punctuation">,</span>
    <span class="token string">'welfare'</span><span class="token punctuation">:</span> <span class="token string">''</span><span class="token punctuation">,</span>
<span class="token punctuation">}</span>
end_page <span class="token operator">=</span> input<span class="token punctuation">(</span><span class="token string">"请输入结束页："</span><span class="token punctuation">)</span>
data_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
<span class="token keyword">for</span> page <span class="token keyword">in</span> range<span class="token punctuation">(</span><span class="token number">1</span><span class="token punctuation">,</span> int<span class="token punctuation">(</span>end_page<span class="token punctuation">)</span> <span class="token operator">+</span> <span class="token number">1</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    url <span class="token operator">=</span> base_url<span class="token punctuation">.</span>format<span class="token punctuation">(</span>word<span class="token punctuation">,</span> page<span class="token punctuation">)</span>
    response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>url<span class="token punctuation">,</span> params<span class="token operator">=</span>params<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'gbk'</span><span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># print(response)</span>
    <span class="token comment" spellcheck="true"># position = re.findall(r'', response, re.S)</span>
    <span class="token comment" spellcheck="true"># work_place = re.findall(r'', response, re.S)</span>
    <span class="token comment" spellcheck="true"># salary = re.findall(r'', response, re.S)[0]</span>
    <span class="token comment" spellcheck="true"># update = re.findall(r'', response, re.S)</span>
    <span class="token comment" spellcheck="true"># welfare = re.findall(r'', response, re.S)</span>
    <span class="token comment" spellcheck="true"># company = re.findall(r'', response, re.S)</span>
    <span class="token comment" spellcheck="true"># company_type = re.findall(r'', response, re.S)</span>
    <span class="token comment" spellcheck="true"># work_year = re.findall(r'',response, re.S)</span>
    pattern <span class="token operator">=</span> r<span class="token string">'"job_name":"(.*?)".*?"company_name":"(.*?)".*?providesalary_text":"(.*?)".*?"workarea_text":"(.*?)".*?companytype_text":"(.*?)".*?workyear":"(.*?)".*?"issuedate":"(.*?)".*?jobwelf":"(.*?)"'</span>
    content <span class="token operator">=</span> re<span class="token punctuation">.</span>findall<span class="token punctuation">(</span>pattern<span class="token punctuation">,</span> response<span class="token punctuation">,</span> re<span class="token punctuation">.</span>S<span class="token punctuation">)</span>
    <span class="token comment" spellcheck="true"># print(content)</span>
    <span class="token keyword">for</span> info <span class="token keyword">in</span> content<span class="token punctuation">:</span>
        data <span class="token operator">=</span> <span class="token punctuation">{</span>
            <span class="token string">'position'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\\'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">,</span>
            <span class="token string">'company'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
            <span class="token string">'salary'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">2</span><span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\\'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">,</span>
            <span class="token string">'work_place'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">3</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
            <span class="token string">'company_type'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">4</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
            <span class="token string">'work_year'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">5</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
            <span class="token string">'update'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">6</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
            <span class="token string">'welfare'</span><span class="token punctuation">:</span> info<span class="token punctuation">[</span><span class="token number">7</span><span class="token punctuation">]</span>
        <span class="token punctuation">}</span>
        data_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>data<span class="token punctuation">)</span>

<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'前程.csv'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">,</span> newline<span class="token operator">=</span><span class="token string">''</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    fieldnames <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'position'</span><span class="token punctuation">,</span> <span class="token string">'company'</span><span class="token punctuation">,</span> <span class="token string">'salary'</span><span class="token punctuation">,</span> <span class="token string">'work_place'</span><span class="token punctuation">,</span> <span class="token string">'company_type'</span><span class="token punctuation">,</span> <span class="token string">'work_year'</span><span class="token punctuation">,</span> <span class="token string">'update'</span><span class="token punctuation">,</span> <span class="token string">'welfare'</span><span class="token punctuation">]</span>
    writer <span class="token operator">=</span> csv<span class="token punctuation">.</span>DictWriter<span class="token punctuation">(</span>f<span class="token punctuation">,</span> fieldnames<span class="token operator">=</span>fieldnames<span class="token punctuation">)</span>
    writer<span class="token punctuation">.</span>writeheader<span class="token punctuation">(</span><span class="token punctuation">)</span>
    writer<span class="token punctuation">.</span>writerows<span class="token punctuation">(</span>data_list<span class="token punctuation">)</span>
</code></pre>
<h3 id="6-爬取图片（使用xpath）"><a href="#6-爬取图片（使用xpath）" class="headerlink" title="6.爬取图片（使用xpath）"></a>6.爬取图片（使用xpath）</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> requests
<span class="token keyword">import</span> os
<span class="token keyword">from</span> lxml <span class="token keyword">import</span> etree

<span class="token keyword">if</span> __name__ <span class="token operator">==</span> <span class="token string">'__main__'</span><span class="token punctuation">:</span>

    headers <span class="token operator">=</span> <span class="token punctuation">{</span>
        <span class="token string">'User-Agent'</span><span class="token punctuation">:</span>
            <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'</span>
    <span class="token punctuation">}</span>

    urls <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'http://pic.netbian.com/4kdongman/index_{}.html'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>i<span class="token punctuation">)</span> <span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span><span class="token number">1</span><span class="token punctuation">,</span> <span class="token number">147</span><span class="token punctuation">)</span><span class="token punctuation">]</span>
    <span class="token keyword">for</span> url <span class="token keyword">in</span> urls<span class="token punctuation">:</span>
        response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>text

        <span class="token comment" spellcheck="true"># 数据解析</span>
        tree <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>response<span class="token punctuation">)</span>
        <span class="token keyword">if</span> <span class="token operator">not</span> os<span class="token punctuation">.</span>path<span class="token punctuation">.</span>exists<span class="token punctuation">(</span><span class="token string">'./picLibs_dongman'</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
            os<span class="token punctuation">.</span>mkdir<span class="token punctuation">(</span><span class="token string">'./picLibs_dongman'</span><span class="token punctuation">)</span>
        li_list <span class="token operator">=</span> tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="slist"]/ul/li'</span><span class="token punctuation">)</span>
        <span class="token keyword">for</span> li <span class="token keyword">in</span> li_list<span class="token punctuation">:</span>
            img_href <span class="token operator">=</span> li<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'./a/@href'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>  <span class="token comment" spellcheck="true"># /tupian/26531.html</span>
            img_href <span class="token operator">=</span> <span class="token string">'http://pic.netbian.com'</span><span class="token operator">+</span>img_href
            <span class="token comment" spellcheck="true"># print(img_href)  # http://pic.netbian.com/tupian/26531.html</span>

            response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>img_href<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>text
            tree <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>response<span class="token punctuation">)</span>
            img_src <span class="token operator">=</span> tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="photo-pic"]/a/img/@src'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            img_src <span class="token operator">=</span> <span class="token string">'http://pic.netbian.com'</span><span class="token operator">+</span>img_src
            <span class="token comment" spellcheck="true"># print(img_src)  # http://pic.netbian.com/uploads/allimg/201113/003901-1605199141caf8.jpg</span>

            img_name <span class="token operator">=</span> tree<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="photo-pic"]/a/img/@title'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            img_name <span class="token operator">=</span> img_name<span class="token operator">+</span><span class="token string">'.jpg'</span>
            img_name <span class="token operator">=</span> img_name<span class="token punctuation">.</span>encode<span class="token punctuation">(</span><span class="token string">'iso-8859-1'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'gbk'</span><span class="token punctuation">)</span>

            img_data <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>img_src<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content

            img_path <span class="token operator">=</span> <span class="token string">'picLibs_dongman/'</span><span class="token operator">+</span>img_name
            <span class="token keyword">with</span> open<span class="token punctuation">(</span>img_path<span class="token punctuation">,</span> <span class="token string">'wb'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
                f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>img_data<span class="token punctuation">)</span>
                <span class="token keyword">print</span><span class="token punctuation">(</span>f<span class="token string">'{img_name} 下载成功'</span><span class="token punctuation">)</span>
</code></pre>
<h3 id="7-爬取招聘信息（xpath-base64）"><a href="#7-爬取招聘信息（xpath-base64）" class="headerlink" title="7.爬取招聘信息（xpath,base64）"></a>7.爬取招聘信息（xpath,base64）</h3><pre class=" language-python"><code class="language-python"><span class="token keyword">import</span> requests
<span class="token keyword">import</span> base64
<span class="token keyword">import</span> csv
<span class="token keyword">import</span> pymysql
<span class="token keyword">from</span> lxml <span class="token keyword">import</span> etree
<span class="token keyword">import</span> time

<span class="token string">'http://localhost:8080/getPosition?id=MTA4MQ=='</span>
<span class="token string">'id="1082"'</span>
<span class="token string">'http://localhost:8080/getPosition?id=MTA4Mg=='</span>
<span class="token string">'http://localhost:8080/getPosition?id=MTA4Mw=='</span>
<span class="token string">'http://localhost:8080/getPosition?id=MTA4Ng=='</span>
<span class="token string">'MTA4MQ'</span>
<span class="token string">'1081'</span>
<span class="token string">'http://localhost:8080/page/MS1hbGljZQ==/12/749'</span>
<span class="token string">'http://localhost:8080/page/Mi1hbGljZQ==/12/749'</span>
<span class="token string">'http://localhost:8080/page/My1hbGljZQ==/12/749'</span>
<span class="token string">'http://localhost:8080/page/NC1hbGljZQ==/12/749'</span>
<span class="token string">'http://localhost:8080/page/NS1hbGljZQ==/12/749'</span>
<span class="token string">'http://localhost:8080/page/Mi1hbGljZQ==/12/749'</span>
headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'</span>
<span class="token punctuation">}</span>
data_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>


<span class="token keyword">def</span> <span class="token function">detail_data</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span><span class="token number">1</span><span class="token punctuation">,</span> <span class="token number">800</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
        page <span class="token operator">=</span> str<span class="token punctuation">(</span>i<span class="token punctuation">)</span> <span class="token operator">+</span> <span class="token string">'-alice'</span>
        result <span class="token operator">=</span> base64<span class="token punctuation">.</span>b64encode<span class="token punctuation">(</span>page<span class="token punctuation">.</span>encode<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token punctuation">)</span>
        url <span class="token operator">=</span> <span class="token string">'http://localhost:8080/page/'</span> <span class="token operator">+</span> result <span class="token operator">+</span> <span class="token string">'/12/749'</span>
        response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
        sel <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>response<span class="token punctuation">)</span>
        id_list <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//legend/a/@id'</span><span class="token punctuation">)</span>

        edu_level_list <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="article-list"]//div/fieldset//div[1]/div[4]/text()'</span><span class="token punctuation">)</span>
        create_time_list <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="info"]/span[2]/text()'</span><span class="token punctuation">)</span>
        company_size_list <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="layui-row"]/div[@class="layui-col-md2"]/span/text()'</span><span class="token punctuation">)</span>
        workingExp_list <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="article-list"]/div//fieldset//div[1]/div[3] /text()'</span><span class="token punctuation">)</span>
        company_name_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
        job_name_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
        salary_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
        city_name_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
        welfare_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
        responsibility_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
        place_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>

        <span class="token keyword">for</span> job_id <span class="token keyword">in</span> id_list<span class="token punctuation">:</span>
            result <span class="token operator">=</span> base64<span class="token punctuation">.</span>b64encode<span class="token punctuation">(</span>job_id<span class="token punctuation">.</span>encode<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token punctuation">)</span>
            <span class="token comment" spellcheck="true"># print(result)</span>
            detail_url <span class="token operator">=</span> <span class="token string">'http://localhost:8080/getPosition?id='</span> <span class="token operator">+</span> result
            <span class="token comment" spellcheck="true"># print(detail_url)</span>
            response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>detail_url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
            sel <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>response<span class="token punctuation">)</span>

            company_name <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="affix-side"]/div[1]/div/div/div[2]/p/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            company_name_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>company_name<span class="token punctuation">)</span>

            job_name <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//h3/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            job_name_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>job_name<span class="token punctuation">)</span>

            salary <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="layui-card"]/div[2]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            salary_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>salary<span class="token punctuation">)</span>

            city_name <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="affix-side"]/div[1]//div[2]/span/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            city_name_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>city_name<span class="token punctuation">)</span>

            welfare <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="article-list"]//div[2]//div[2]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            welfare_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>welfare<span class="token punctuation">)</span>

            <span class="token keyword">if</span> len<span class="token punctuation">(</span>sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="article-list"]//div[3]//div[2]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token operator">==</span> <span class="token number">0</span><span class="token punctuation">:</span>
                responsibility <span class="token operator">=</span> <span class="token string">'null'</span>
            <span class="token keyword">else</span><span class="token punctuation">:</span>
                responsibility <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="article-list"]//div[3]//div[2]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            responsibility_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>responsibility<span class="token punctuation">)</span>

            place <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="article-list"]//div[4]/div/div[2]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            place_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>place<span class="token punctuation">)</span>

        <span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span>len<span class="token punctuation">(</span>company_name_list<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
            data <span class="token operator">=</span> <span class="token punctuation">{</span>
                <span class="token string">'company_name'</span><span class="token punctuation">:</span> company_name_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'edu_level'</span><span class="token punctuation">:</span> edu_level_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'job_name'</span><span class="token punctuation">:</span> job_name_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'salary'</span><span class="token punctuation">:</span> salary_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'create_time'</span><span class="token punctuation">:</span> create_time_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'city_name'</span><span class="token punctuation">:</span> city_name_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'company_size'</span><span class="token punctuation">:</span> company_size_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'welfare'</span><span class="token punctuation">:</span> welfare_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">','</span><span class="token punctuation">,</span> <span class="token string">'，'</span><span class="token punctuation">)</span><span class="token punctuation">,</span>
                <span class="token string">'responsibility'</span><span class="token punctuation">:</span> responsibility_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">','</span><span class="token punctuation">,</span> <span class="token string">'，'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\xa0'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">,</span>
                <span class="token string">'place'</span><span class="token punctuation">:</span> place_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">','</span><span class="token punctuation">,</span> <span class="token string">'，'</span><span class="token punctuation">)</span><span class="token punctuation">,</span>
                <span class="token string">'workingExp'</span><span class="token punctuation">:</span> workingExp_list<span class="token punctuation">[</span>i<span class="token punctuation">]</span>

            <span class="token punctuation">}</span>

            <span class="token keyword">print</span><span class="token punctuation">(</span>data<span class="token punctuation">)</span>
            data_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>data<span class="token punctuation">)</span>
            <span class="token comment" spellcheck="true"># insert_mysql(data)</span>
    insert_csv<span class="token punctuation">(</span>data_list<span class="token punctuation">)</span>


<span class="token keyword">def</span> <span class="token function">insert_csv</span><span class="token punctuation">(</span>data_list<span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># Write every row dict in data_list to cn_01_recruitment.csv (header row first),</span>
    <span class="token comment" spellcheck="true"># then call load_mysql() once the file has been closed.</span>
    columns <span class="token operator">=</span> <span class="token punctuation">[</span>
        <span class="token string">'company_name'</span><span class="token punctuation">,</span> <span class="token string">'edu_level'</span><span class="token punctuation">,</span> <span class="token string">'job_name'</span><span class="token punctuation">,</span> <span class="token string">'salary'</span><span class="token punctuation">,</span>
        <span class="token string">'create_time'</span><span class="token punctuation">,</span> <span class="token string">'city_name'</span><span class="token punctuation">,</span> <span class="token string">'company_size'</span><span class="token punctuation">,</span> <span class="token string">'welfare'</span><span class="token punctuation">,</span>
        <span class="token string">'responsibility'</span><span class="token punctuation">,</span> <span class="token string">'place'</span><span class="token punctuation">,</span> <span class="token string">'workingExp'</span><span class="token punctuation">,</span>
    <span class="token punctuation">]</span>
    <span class="token comment" spellcheck="true"># newline='' leaves row endings to the csv writer instead of the text layer.</span>
    <span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'cn_01_recruitment.csv'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">,</span> newline<span class="token operator">=</span><span class="token string">''</span><span class="token punctuation">)</span> <span class="token keyword">as</span> csv_file<span class="token punctuation">:</span>
        out <span class="token operator">=</span> csv<span class="token punctuation">.</span>DictWriter<span class="token punctuation">(</span>csv_file<span class="token punctuation">,</span> fieldnames<span class="token operator">=</span>columns<span class="token punctuation">)</span>
        out<span class="token punctuation">.</span>writeheader<span class="token punctuation">(</span><span class="token punctuation">)</span>
        out<span class="token punctuation">.</span>writerows<span class="token punctuation">(</span>data_list<span class="token punctuation">)</span>
    load_mysql<span class="token punctuation">(</span><span class="token punctuation">)</span>


<span class="token comment" spellcheck="true"># def insert_mysql(data):</span>
<span class="token comment" spellcheck="true">#</span>
<span class="token comment" spellcheck="true">#     conn = pymysql.connect(host='localhost', user='root', password='root', port=3306, db='spiders')</span>
<span class="token comment" spellcheck="true">#     cursor = conn.cursor()</span>
<span class="token comment" spellcheck="true">#</span>
<span class="token comment" spellcheck="true">#     sql = 'create table if not exists recruitment ( ' \</span>
<span class="token comment" spellcheck="true">#           'id int(11) primary key auto_increment, ' \</span>
<span class="token comment" spellcheck="true">#           'company_name varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'edu_level varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'job_name varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'salary varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'create_time varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'city_name varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'company_size varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'welfare varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'responsibility text, ' \</span>
<span class="token comment" spellcheck="true">#           'place varchar(255), ' \</span>
<span class="token comment" spellcheck="true">#           'workingExp varchar(255)' \</span>
<span class="token comment" spellcheck="true">#           ')charset utf8'</span>
<span class="token comment" spellcheck="true">#     cursor.execute(sql)</span>
<span class="token comment" spellcheck="true">#</span>
<span class="token comment" spellcheck="true">#     table = 'recruitment'</span>
<span class="token comment" spellcheck="true">#     keys = ','.join(data.keys())</span>
<span class="token comment" spellcheck="true">#</span>
<span class="token comment" spellcheck="true">#     values = ','.join(['%s'] * len(data))</span>
<span class="token comment" spellcheck="true">#     sql = 'insert into {table}({keys}) values({values})'.format(table=table, keys=keys, values=values)</span>
<span class="token comment" spellcheck="true">#     try:</span>
<span class="token comment" spellcheck="true">#         if cursor.execute(sql, tuple(data.values())):</span>
<span class="token comment" spellcheck="true">#             print('Successful')</span>
<span class="token comment" spellcheck="true">#             conn.commit()</span>
<span class="token comment" spellcheck="true">#     except Exception as e:</span>
<span class="token comment" spellcheck="true">#         print('Failed', e)</span>
<span class="token comment" spellcheck="true">#         conn.rollback()</span>


<span class="token keyword">def</span> <span class="token function">load_mysql</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># Bulk-load cn_01_recruitment.csv into the MySQL table recruitment, creating the table first if needed.</span>
    <span class="token comment" spellcheck="true"># Prints '成功' after a committed load, or '失败' plus the error when the load fails.</span>
    <span class="token comment" spellcheck="true"># local_infile=1 enables LOAD DATA LOCAL on this connection.</span>
    conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> password<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span> db<span class="token operator">=</span><span class="token string">'spiders'</span><span class="token punctuation">,</span> local_infile<span class="token operator">=</span><span class="token number">1</span><span class="token punctuation">)</span>
    <span class="token keyword">try</span><span class="token punctuation">:</span>
        cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token comment" spellcheck="true"># id is declared last so the 11 CSV columns fill the first 11 table columns in order.</span>
        sql <span class="token operator">=</span> <span class="token string">'create table if not exists recruitment ( '</span> \
              <span class="token string">'company_name varchar(1255), '</span> \
              <span class="token string">'edu_level varchar(1255), '</span> \
              <span class="token string">'job_name varchar(1255), '</span> \
              <span class="token string">'salary varchar(1255), '</span> \
              <span class="token string">'create_time datetime, '</span> \
              <span class="token string">'city_name varchar(1255), '</span> \
              <span class="token string">'company_size varchar(1255), '</span> \
              <span class="token string">'welfare varchar(1255), '</span> \
              <span class="token string">'responsibility text, '</span> \
              <span class="token string">'place varchar(1255), '</span> \
              <span class="token string">'workingExp varchar(1255),'</span> \
              <span class="token string">'id int(30) primary key auto_increment '</span> \
              <span class="token string">')charset utf8'</span>
        cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
        <span class="token keyword">try</span><span class="token punctuation">:</span>
            <span class="token comment" spellcheck="true"># The csv module ends rows with CR LF and wraps fields in double quotes when needed,</span>
            <span class="token comment" spellcheck="true"># so MySQL is told the same; otherwise the last column keeps a trailing carriage return.</span>
            sql <span class="token operator">=</span> <span class="token string">"load data local infile './cn_01_recruitment.csv' into table recruitment "</span> \
                  <span class="token string">"fields terminated by ',' optionally enclosed by '\"' "</span> \
                  <span class="token string">"lines terminated by '\\r\\n' ignore 1 lines"</span>
            cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
            conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'成功'</span><span class="token punctuation">)</span>
        <span class="token keyword">except</span> Exception <span class="token keyword">as</span> e<span class="token punctuation">:</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'失败'</span><span class="token punctuation">,</span> e<span class="token punctuation">)</span>
    <span class="token keyword">finally</span><span class="token punctuation">:</span>
        <span class="token comment" spellcheck="true"># Always release the connection, including when creating the table fails.</span>
        conn<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>


<span class="token keyword">if</span> __name__ <span class="token operator">==</span> <span class="token string">"__main__"</span><span class="token punctuation">:</span>
    detail_data<span class="token punctuation">(</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># ```</span>

<span class="token comment" spellcheck="true">### 8.爬取美团酒店信息</span>

<span class="token comment" spellcheck="true"># ~~~python</span>
<span class="token keyword">import</span> requests
<span class="token keyword">import</span> json
<span class="token keyword">import</span> csv
<span class="token keyword">import</span> pymysql


<span class="token comment" spellcheck="true"># Fetch 10 result pages from the Meituan hotel search endpoint and store each hotel in MySQL.</span>
url <span class="token operator">=</span> <span class="token string">'https://ihotel.meituan.com/hbsearch/HotelSearch'</span>

headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span> <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'</span><span class="token punctuation">,</span>
    <span class="token string">'Referer'</span><span class="token punctuation">:</span> <span class="token string">'https://hotel.meituan.com/'</span>
<span class="token punctuation">}</span>
data_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>

<span class="token comment" spellcheck="true"># One connection for the whole run, instead of reconnecting for every hotel row.</span>
conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span> user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> password<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> db<span class="token operator">=</span><span class="token string">'spiders2'</span><span class="token punctuation">,</span> charset<span class="token operator">=</span><span class="token string">'utf8'</span><span class="token punctuation">)</span>
<span class="token keyword">try</span><span class="token punctuation">:</span>
    cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>
    table <span class="token operator">=</span> <span class="token string">'meituan'</span>
    <span class="token comment" spellcheck="true"># Create the table once up front; a failure is printed and the scrape still runs.</span>
    <span class="token keyword">try</span><span class="token punctuation">:</span>
        sql <span class="token operator">=</span> <span class="token string">'create table if not exists meituan('</span> \
              <span class="token string">'name varchar(235) primary key, '</span> \
              <span class="token string">'hotelStar varchar(235), '</span> \
              <span class="token string">'originalPrice varchar(235), '</span> \
              <span class="token string">'comment varchar(235), '</span> \
              <span class="token string">'score varchar(235), '</span> \
              <span class="token string">'addr varchar(235), '</span> \
              <span class="token string">'lat varchar(235), '</span> \
              <span class="token string">'lng varchar(235)'</span> \
              <span class="token string">')charset utf8'</span>
        cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>
    <span class="token keyword">except</span> Exception <span class="token keyword">as</span> e<span class="token punctuation">:</span>
        <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">)</span>

    <span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span><span class="token number">0</span><span class="token punctuation">,</span> <span class="token number">10</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
        <span class="token comment" spellcheck="true"># Query string for one page: limit is 20 and offset advances by 20 per page.</span>
        params <span class="token operator">=</span> <span class="token punctuation">{</span>
            <span class="token string">'utm_medium'</span><span class="token punctuation">:</span> <span class="token string">'pc'</span><span class="token punctuation">,</span>
            <span class="token string">'version_name'</span><span class="token punctuation">:</span> <span class="token number">999.9</span><span class="token punctuation">,</span>
            <span class="token string">'cateId'</span><span class="token punctuation">:</span> <span class="token number">20</span><span class="token punctuation">,</span>
            <span class="token string">'attr_28'</span><span class="token punctuation">:</span> <span class="token number">129</span><span class="token punctuation">,</span>
            <span class="token comment" spellcheck="true"># NOTE(review): the leading space in this uuid looks accidental - confirm before removing.</span>
            <span class="token string">'uuid'</span><span class="token punctuation">:</span> <span class="token string">' DA1E77214B957240252862761F1E15865B9B1AA2ACD29C83F0F61D03E8DFB644@1607773617614'</span><span class="token punctuation">,</span>
            <span class="token string">'cityId'</span><span class="token punctuation">:</span> <span class="token number">1</span><span class="token punctuation">,</span>
            <span class="token string">'offset'</span><span class="token punctuation">:</span> i <span class="token operator">*</span> <span class="token number">20</span><span class="token punctuation">,</span>
            <span class="token string">'limit'</span><span class="token punctuation">:</span> <span class="token number">20</span><span class="token punctuation">,</span>
            <span class="token string">'startDay'</span><span class="token punctuation">:</span> <span class="token number">20201212</span><span class="token punctuation">,</span>
            <span class="token string">'endDay'</span><span class="token punctuation">:</span> <span class="token number">20201212</span><span class="token punctuation">,</span>
            <span class="token string">'q'</span><span class="token punctuation">:</span> <span class="token string">''</span><span class="token punctuation">,</span>
            <span class="token string">'sort'</span><span class="token punctuation">:</span> <span class="token string">'defaults'</span><span class="token punctuation">,</span>
            <span class="token string">'X-FOR-WITH'</span><span class="token punctuation">:</span> <span class="token string">'y0S49miBphUhlSdxHk0yNx/zQUUPdVFaVHUcE7pp957986DvRcVqn1WFmBoRwkk8sysD+0+jcjaysGC3oiWjgNG1qIsmIorGCBEwXdMKoXyIdf+EQ4DZWwlrv+qY8Yhet4H+/1aKyaQfAXr4MCj//K0IjMPOwYhRX1r9yIHCe3v7W/0jrHmp94an7/7vUBwuzSaMtK70jedmB80BqgYsKA=='</span>
        <span class="token punctuation">}</span>
        response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">,</span> params<span class="token operator">=</span>params<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
        results <span class="token operator">=</span> json<span class="token punctuation">.</span>loads<span class="token punctuation">(</span>response<span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token string">'data'</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token string">'searchresult'</span><span class="token punctuation">]</span>
        <span class="token keyword">for</span> hotel <span class="token keyword">in</span> results<span class="token punctuation">:</span>
            <span class="token comment" spellcheck="true"># Key order here is also the column order of the insert statement below.</span>
            data <span class="token operator">=</span> <span class="token punctuation">{</span>
                <span class="token string">'name'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'name'</span><span class="token punctuation">]</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># hotel name</span>
                <span class="token string">'hotelStar'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'hotelStar'</span><span class="token punctuation">]</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># hotel type</span>
                <span class="token string">'originalPrice'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'originalPrice'</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'comment'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'commentsCountDesc'</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'score'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'scoreIntro'</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'addr'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'addr'</span><span class="token punctuation">]</span><span class="token punctuation">,</span>  <span class="token comment" spellcheck="true"># hotel address</span>
                <span class="token string">'lat'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'lat'</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
                <span class="token string">'lng'</span><span class="token punctuation">:</span> hotel<span class="token punctuation">[</span><span class="token string">'lng'</span><span class="token punctuation">]</span>
            <span class="token punctuation">}</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span>data<span class="token punctuation">)</span>
            data_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>data<span class="token punctuation">)</span>

            keys <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>data<span class="token punctuation">.</span>keys<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
            values <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token string">'%s'</span><span class="token punctuation">]</span> <span class="token operator">*</span> len<span class="token punctuation">(</span>data<span class="token punctuation">)</span><span class="token punctuation">)</span>
            sql <span class="token operator">=</span> <span class="token string">'insert into {table}({keys}) values({values})'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>table<span class="token operator">=</span>table<span class="token punctuation">,</span> keys<span class="token operator">=</span>keys<span class="token punctuation">,</span> values<span class="token operator">=</span>values<span class="token punctuation">)</span>
            <span class="token keyword">try</span><span class="token punctuation">:</span>
                cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">,</span> tuple<span class="token punctuation">(</span>data<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
                conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>
            <span class="token keyword">except</span> Exception <span class="token keyword">as</span> e<span class="token punctuation">:</span>
                <span class="token comment" spellcheck="true"># A failed insert is printed and the remaining hotels are still processed.</span>
                <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">)</span>
<span class="token keyword">finally</span><span class="token punctuation">:</span>
    conn<span class="token punctuation">.</span>close<span class="token punctuation">(</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># with open('美团.csv', 'w', encoding='utf-8', newline='') as f:</span>
<span class="token comment" spellcheck="true">#     fieldnames = ['name', 'hotelStar', 'originalPrice', 'comment', 'score', 'addr', 'lat', 'lng']</span>
<span class="token comment" spellcheck="true">#     writer = csv.DictWriter(f, fieldnames=fieldnames)</span>
<span class="token comment" spellcheck="true">#     writer.writeheader()</span>
<span class="token comment" spellcheck="true">#     writer.writerows(data_list)</span>
<span class="token comment" spellcheck="true"># ~~~</span>

<span class="token comment" spellcheck="true">### 9.爬取求是网（xpath）</span>

<span class="token comment" spellcheck="true"># ~~~python</span>
<span class="token keyword">import</span> requests
<span class="token keyword">from</span> lxml <span class="token keyword">import</span> etree
<span class="token keyword">import</span> csv

headers <span class="token operator">=</span> <span class="token punctuation">{</span>
    <span class="token string">'User-Agent'</span><span class="token punctuation">:</span>
        <span class="token string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'</span>
<span class="token punctuation">}</span>

url <span class="token operator">=</span> <span class="token string">'http://www.qstheory.cn/dukan/qs/2019-01/01/c_1123932149.htm?spm=zm5062-001.0.0.1.OpkwSK'</span>

html <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>

sel <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>html<span class="token punctuation">)</span>
data_list <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
p_list <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="highlight"]/p'</span><span class="token punctuation">)</span>
<span class="token keyword">for</span> p <span class="token keyword">in</span> p_list<span class="token punctuation">:</span>
    <span class="token comment" spellcheck="true"># href = p.xpath('./strong/a/@href | ./a/@href | ./span/a/@href | ./span/strong/a/@href')[0]</span>
    href <span class="token operator">=</span> p<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'.//@href'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
    <span class="token comment" spellcheck="true"># print(href)</span>
    response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>href<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
    sel <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>response<span class="token punctuation">)</span>
    detail_urls <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="highlight"]/p/a/@href | //div[@class="highlight"]/p/strong/a/@href | //div[@class="highlight"]/p//strong/a/@href'</span><span class="token punctuation">)</span>
    <span class="token keyword">for</span> detail_url <span class="token keyword">in</span> detail_urls<span class="token punctuation">:</span>
        <span class="token comment" spellcheck="true"># print(detail_url)</span>
        response <span class="token operator">=</span> requests<span class="token punctuation">.</span>get<span class="token punctuation">(</span>url<span class="token operator">=</span>detail_url<span class="token punctuation">,</span> headers<span class="token operator">=</span>headers<span class="token punctuation">)</span><span class="token punctuation">.</span>content<span class="token punctuation">.</span>decode<span class="token punctuation">(</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span>
        sel <span class="token operator">=</span> etree<span class="token punctuation">.</span>HTML<span class="token punctuation">(</span>response<span class="token punctuation">)</span>
        <span class="token keyword">try</span><span class="token punctuation">:</span>
            title <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//h1/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\r\n'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\u3000'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span>
            author <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'/html/body/section/div/div/div/div/span[2]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\r\n'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span>
            time_update <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'/html/body/section/div/div/div/div/span[3]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\r\n'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span>
            photo <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="highlight"]//@src'</span><span class="token punctuation">)</span>
            <span class="token keyword">if</span> len<span class="token punctuation">(</span>photo<span class="token punctuation">)</span> <span class="token operator">></span> <span class="token number">0</span><span class="token punctuation">:</span>
                time <span class="token operator">=</span> time_update<span class="token punctuation">.</span>split<span class="token punctuation">(</span><span class="token string">' '</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>split<span class="token punctuation">(</span><span class="token string">'-'</span><span class="token punctuation">)</span>
                time1 <span class="token operator">=</span> <span class="token string">'-'</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>time<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">:</span><span class="token number">2</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
                time2 <span class="token operator">=</span> time<span class="token punctuation">[</span><span class="token number">2</span><span class="token punctuation">]</span>
                result <span class="token operator">=</span> time1 <span class="token operator">+</span> <span class="token string">'/'</span> <span class="token operator">+</span> time2 <span class="token operator">+</span> <span class="token string">'/'</span>
                photo_url <span class="token operator">=</span> <span class="token string">'http://www.qstheory.cn/dukan/qs/'</span> <span class="token operator">+</span> result <span class="token operator">+</span> photo<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            <span class="token keyword">else</span><span class="token punctuation">:</span>
                photo_url <span class="token operator">=</span> photo
            content <span class="token operator">=</span> sel<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//div[@class="highlight"]/p/text() | //div[@class="highlight"]/p/strong/text()'</span><span class="token punctuation">)</span>
            content <span class="token operator">=</span> <span class="token string">''</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>content<span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\u3000\u3000'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\xa0'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span><span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token string">'\r\n\r\n'</span><span class="token punctuation">,</span> <span class="token string">''</span><span class="token punctuation">)</span>

            data <span class="token operator">=</span> <span class="token punctuation">{</span>
                <span class="token string">'title'</span><span class="token punctuation">:</span> title<span class="token punctuation">,</span>
                <span class="token string">'author'</span><span class="token punctuation">:</span> author<span class="token punctuation">,</span>
                <span class="token string">'time'</span><span class="token punctuation">:</span> time_update<span class="token punctuation">,</span>
                <span class="token string">'photo_url'</span><span class="token punctuation">:</span> photo_url<span class="token punctuation">,</span>
                <span class="token string">'content'</span><span class="token punctuation">:</span> content
            <span class="token punctuation">}</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span>data<span class="token punctuation">)</span>
            data_list<span class="token punctuation">.</span>append<span class="token punctuation">(</span>data<span class="token punctuation">)</span>
        <span class="token keyword">except</span> Exception <span class="token keyword">as</span> e<span class="token punctuation">:</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span>e<span class="token punctuation">)</span>

<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'求是.csv'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">,</span> newline<span class="token operator">=</span><span class="token string">''</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    fieldnames <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'title'</span><span class="token punctuation">,</span> <span class="token string">'author'</span><span class="token punctuation">,</span> <span class="token string">'time'</span><span class="token punctuation">,</span> <span class="token string">'photo_url'</span><span class="token punctuation">,</span> <span class="token string">'content'</span><span class="token punctuation">]</span>
    writer <span class="token operator">=</span> csv<span class="token punctuation">.</span>DictWriter<span class="token punctuation">(</span>f<span class="token punctuation">,</span> fieldnames<span class="token operator">=</span>fieldnames<span class="token punctuation">)</span>
    writer<span class="token punctuation">.</span>writeheader<span class="token punctuation">(</span><span class="token punctuation">)</span>
    writer<span class="token punctuation">.</span>writerows<span class="token punctuation">(</span>data_list<span class="token punctuation">)</span>
<span class="token operator">~</span><span class="token operator">~</span><span class="token operator">~</span>



<span class="token comment" spellcheck="true">### 10爬取豆瓣图书（scrapy）</span>

<span class="token operator">~</span><span class="token operator">~</span><span class="token operator">~</span>python
<span class="token comment" spellcheck="true"># 特点：里外内容合并 meta</span>
<span class="token comment" spellcheck="true"># -*- coding: utf-8 -*-</span>
<span class="token keyword">import</span> scrapy
<span class="token keyword">from</span> douban_bookPro<span class="token punctuation">.</span>items <span class="token keyword">import</span> DoubanBookproItem


<span class="token keyword">class</span> <span class="token class-name">BookSpider</span><span class="token punctuation">(</span>scrapy<span class="token punctuation">.</span>Spider<span class="token punctuation">)</span><span class="token punctuation">:</span>
    name <span class="token operator">=</span> <span class="token string">'book'</span>

    <span class="token keyword">def</span> <span class="token function">start_requests</span><span class="token punctuation">(</span>self<span class="token punctuation">)</span><span class="token punctuation">:</span>
        urls <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'https://book.douban.com/tag/%E6%97%A5%E6%9C%AC%E6%96%87%E5%AD%A6?start={}&amp;type=T'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>i<span class="token punctuation">)</span> <span class="token keyword">for</span> i <span class="token keyword">in</span>
                range<span class="token punctuation">(</span><span class="token number">0</span><span class="token punctuation">,</span> <span class="token number">980</span><span class="token punctuation">,</span> <span class="token number">20</span><span class="token punctuation">)</span><span class="token punctuation">]</span>

        <span class="token keyword">for</span> url <span class="token keyword">in</span> urls<span class="token punctuation">:</span>
            <span class="token keyword">yield</span> scrapy<span class="token punctuation">.</span>Request<span class="token punctuation">(</span>url<span class="token punctuation">,</span> callback<span class="token operator">=</span>self<span class="token punctuation">.</span>parse<span class="token punctuation">)</span>
            <span class="token keyword">print</span><span class="token punctuation">(</span>url<span class="token punctuation">)</span>

    <span class="token keyword">def</span> <span class="token function">parse</span><span class="token punctuation">(</span>self<span class="token punctuation">,</span> response<span class="token punctuation">)</span><span class="token punctuation">:</span>
        <span class="token comment" spellcheck="true"># print(response)</span>
        detail_links <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//ul[@class="subject-list"]/li'</span><span class="token punctuation">)</span>

        <span class="token keyword">for</span> li <span class="token keyword">in</span> detail_links<span class="token punctuation">:</span>
            link <span class="token operator">=</span> li<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'./div/a/@href'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span>
            img_url <span class="token operator">=</span> li<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'./div/a/img/@src'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span>
            info <span class="token operator">=</span> li<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'./div[2]/div[1]/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
            info <span class="token operator">=</span> str<span class="token punctuation">(</span>info<span class="token punctuation">)</span><span class="token punctuation">.</span>split<span class="token punctuation">(</span><span class="token string">'/'</span><span class="token punctuation">)</span>
            author <span class="token operator">=</span> info<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>rstrip<span class="token punctuation">(</span><span class="token punctuation">)</span>
            translator <span class="token operator">=</span> info<span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
            press <span class="token operator">=</span> info<span class="token punctuation">[</span><span class="token number">2</span><span class="token punctuation">]</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
            price <span class="token operator">=</span> info<span class="token punctuation">[</span><span class="token number">4</span><span class="token punctuation">]</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
            req <span class="token operator">=</span> scrapy<span class="token punctuation">.</span>Request<span class="token punctuation">(</span>url<span class="token operator">=</span>link<span class="token punctuation">,</span> callback<span class="token operator">=</span>self<span class="token punctuation">.</span>detail_parse<span class="token punctuation">)</span>
            req<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'link'</span><span class="token punctuation">]</span> <span class="token operator">=</span> link
            req<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'img_url'</span><span class="token punctuation">]</span> <span class="token operator">=</span> img_url
            req<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'author'</span><span class="token punctuation">]</span> <span class="token operator">=</span> author
            req<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'translator'</span><span class="token punctuation">]</span> <span class="token operator">=</span> translator
            req<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'press'</span><span class="token punctuation">]</span> <span class="token operator">=</span> press
            req<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'price'</span><span class="token punctuation">]</span> <span class="token operator">=</span> price
            <span class="token keyword">yield</span> req

    <span class="token keyword">def</span> <span class="token function">detail_parse</span><span class="token punctuation">(</span>self<span class="token punctuation">,</span> response<span class="token punctuation">)</span><span class="token punctuation">:</span>
        title <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//h1/span/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span>
        time <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="info"]/text()[last()-11]'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">if</span> time <span class="token operator">==</span> <span class="token string">''</span><span class="token punctuation">:</span>
            time <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="info"]/text()[last()-9]'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
        page <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="info"]/text()[last()-9]'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token keyword">if</span> len<span class="token punctuation">(</span>page<span class="token punctuation">.</span>split<span class="token punctuation">(</span><span class="token string">'-'</span><span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token operator">></span> <span class="token number">1</span><span class="token punctuation">:</span>
            page <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="info"]/text()[last()-7]'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
        binding <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="info"]/text()[last()-5]'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
        ISBN <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="info"]/text()[last()-1]'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
        score <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="interest_sectl"]/div/div[2]/strong/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strip<span class="token punctuation">(</span><span class="token punctuation">)</span>
        comments <span class="token operator">=</span> response<span class="token punctuation">.</span>xpath<span class="token punctuation">(</span><span class="token string">'//*[@id="interest_sectl"]/div/div[2]/div/div[2]/span/a/span/text()'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>extract_first<span class="token punctuation">(</span><span class="token punctuation">)</span>

        item <span class="token operator">=</span> DoubanBookproItem<span class="token punctuation">(</span><span class="token punctuation">)</span>

        item<span class="token punctuation">[</span><span class="token string">'ISBN'</span><span class="token punctuation">]</span> <span class="token operator">=</span> ISBN
        item<span class="token punctuation">[</span><span class="token string">'title'</span><span class="token punctuation">]</span> <span class="token operator">=</span> title
        item<span class="token punctuation">[</span><span class="token string">'author'</span><span class="token punctuation">]</span> <span class="token operator">=</span> response<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'author'</span><span class="token punctuation">]</span>
        item<span class="token punctuation">[</span><span class="token string">'score'</span><span class="token punctuation">]</span> <span class="token operator">=</span> score
        item<span class="token punctuation">[</span><span class="token string">'price'</span><span class="token punctuation">]</span> <span class="token operator">=</span> response<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'price'</span><span class="token punctuation">]</span>
        item<span class="token punctuation">[</span><span class="token string">'page'</span><span class="token punctuation">]</span> <span class="token operator">=</span> page
        item<span class="token punctuation">[</span><span class="token string">'time'</span><span class="token punctuation">]</span> <span class="token operator">=</span> time
        item<span class="token punctuation">[</span><span class="token string">'press'</span><span class="token punctuation">]</span> <span class="token operator">=</span> response<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'press'</span><span class="token punctuation">]</span>
        item<span class="token punctuation">[</span><span class="token string">'translator'</span><span class="token punctuation">]</span> <span class="token operator">=</span> response<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'translator'</span><span class="token punctuation">]</span>
        item<span class="token punctuation">[</span><span class="token string">'binding'</span><span class="token punctuation">]</span> <span class="token operator">=</span> binding
        item<span class="token punctuation">[</span><span class="token string">'comments'</span><span class="token punctuation">]</span> <span class="token operator">=</span> comments
        item<span class="token punctuation">[</span><span class="token string">'link'</span><span class="token punctuation">]</span> <span class="token operator">=</span> response<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'link'</span><span class="token punctuation">]</span>
        item<span class="token punctuation">[</span><span class="token string">'img_url'</span><span class="token punctuation">]</span> <span class="token operator">=</span> response<span class="token punctuation">.</span>meta<span class="token punctuation">[</span><span class="token string">'img_url'</span><span class="token punctuation">]</span>

        <span class="token keyword">yield</span> item
        <span class="token keyword">print</span><span class="token punctuation">(</span>item<span class="token punctuation">)</span>

<span class="token operator">~</span><span class="token operator">~</span><span class="token operator">~</span>





<span class="token comment" spellcheck="true"># 文档</span>

<span class="token comment" spellcheck="true">## 1.requests文档</span>

<span class="token punctuation">[</span>requests文档<span class="token punctuation">]</span><span class="token punctuation">(</span>https<span class="token punctuation">:</span><span class="token operator">//</span>requests<span class="token punctuation">.</span>readthedocs<span class="token punctuation">.</span>io<span class="token operator">/</span>zh_CN<span class="token operator">/</span>latest<span class="token operator">/</span><span class="token punctuation">)</span>

<span class="token comment" spellcheck="true">## 2.scrapy文档</span>

<span class="token punctuation">[</span>scrapy文档<span class="token punctuation">]</span><span class="token punctuation">(</span>https<span class="token punctuation">:</span><span class="token operator">//</span>scrapy<span class="token operator">-</span>chs<span class="token punctuation">.</span>readthedocs<span class="token punctuation">.</span>io<span class="token operator">/</span>zh_CN<span class="token operator">/</span><span class="token number">0.24</span><span class="token operator">/</span>intro<span class="token operator">/</span>tutorial<span class="token punctuation">.</span>html<span class="token punctuation">)</span>

<span class="token operator">~</span><span class="token operator">~</span><span class="token operator">~</span>python
<span class="token keyword">with</span> open<span class="token punctuation">(</span><span class="token string">'qiushi.json'</span><span class="token punctuation">,</span> <span class="token string">'w'</span><span class="token punctuation">,</span> encoding<span class="token operator">=</span><span class="token string">'utf-8'</span><span class="token punctuation">)</span> <span class="token keyword">as</span> f<span class="token punctuation">:</span>
    f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>json<span class="token punctuation">.</span>dumps<span class="token punctuation">(</span>data<span class="token punctuation">,</span> indent<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">,</span> ensure_ascii<span class="token operator">=</span><span class="token boolean">False</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
    f<span class="token punctuation">.</span>write<span class="token punctuation">(</span>json<span class="token punctuation">.</span>dumps<span class="token punctuation">(</span>data<span class="token punctuation">,</span> indent<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">,</span> ensure_ascii<span class="token operator">=</span><span class="token boolean">False</span><span class="token punctuation">)</span><span class="token punctuation">)</span>


conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span> user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> password<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> db<span class="token operator">=</span><span class="token string">'spiders'</span><span class="token punctuation">)</span>
cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>

keys <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>data<span class="token punctuation">.</span>keys<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
values <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token string">'%s'</span><span class="token punctuation">]</span> <span class="token operator">*</span> len<span class="token punctuation">(</span>data<span class="token punctuation">)</span><span class="token punctuation">)</span>
values <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token string">'%s'</span><span class="token punctuation">]</span> <span class="token operator">*</span> len<span class="token punctuation">(</span>data<span class="token punctuation">)</span><span class="token punctuation">)</span>

sql <span class="token operator">=</span> <span class="token string">'insert into data({keys}) values({values})'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>keys<span class="token operator">=</span>keys<span class="token punctuation">,</span> values<span class="token operator">=</span>values<span class="token punctuation">)</span>
cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">,</span> tuple<span class="token punctuation">(</span>data<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>

keys <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span>data<span class="token punctuation">.</span>keys<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
values <span class="token operator">=</span> <span class="token string">','</span><span class="token punctuation">.</span>join<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token string">'%s'</span><span class="token punctuation">]</span> <span class="token operator">*</span> len<span class="token punctuation">(</span>data<span class="token punctuation">)</span><span class="token punctuation">)</span>
sql <span class="token operator">=</span> <span class="token string">'insert into douban({keys}) values({values})'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>keys<span class="token operator">=</span>keys<span class="token punctuation">,</span> values<span class="token operator">=</span>values<span class="token punctuation">)</span>
cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">,</span> tuple<span class="token punctuation">(</span>data<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
conn<span class="token punctuation">.</span>commit<span class="token punctuation">(</span><span class="token punctuation">)</span>

conn <span class="token operator">=</span> pymysql<span class="token punctuation">.</span>connect<span class="token punctuation">(</span>host<span class="token operator">=</span><span class="token string">'localhost'</span><span class="token punctuation">,</span> port<span class="token operator">=</span><span class="token number">3306</span><span class="token punctuation">,</span> user<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> password<span class="token operator">=</span><span class="token string">'root'</span><span class="token punctuation">,</span> db<span class="token operator">=</span><span class="token string">'spiders'</span><span class="token punctuation">,</span> local_infile<span class="token operator">=</span><span class="token number">1</span><span class="token punctuation">)</span>
cursor <span class="token operator">=</span> conn<span class="token punctuation">.</span>cursor<span class="token punctuation">(</span><span class="token punctuation">)</span>
sql <span class="token operator">=</span> 'create table huang<span class="token punctuation">(</span>
id int<span class="token punctuation">(</span><span class="token number">11</span><span class="token punctuation">)</span><span class="token punctuation">,</span>  
name varchar<span class="token punctuation">(</span><span class="token number">11</span><span class="token punctuation">)</span><span class="token punctuation">,</span> 
sex varchar<span class="token punctuation">(</span><span class="token number">11</span><span class="token punctuation">)</span>
<span class="token punctuation">)</span>charset utf8'
cursor<span class="token punctuation">.</span>execute<span class="token punctuation">(</span>sql<span class="token punctuation">)</span>

sql <span class="token operator">=</span> <span class="token string">"load data local infile 'double.csv' into table double fields terminated by ','  lines terminated by '\n' ignore 1 lines"</span>




<span class="token operator">~</span><span class="token operator">~</span><span class="token operator">~</span>
















</code></pre>

                
            </div>
            <hr/>

            

    <div class="reprint" id="reprint-statement">
        
            <div class="reprint__author">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-user">
                        文章作者:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="/about" rel="external nofollow noreferrer">Aunean</a>
                </span>
            </div>
            <div class="reprint__type">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-link">
                        文章链接:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="https://www.shiyiri.top/node/1.html">https://www.shiyiri.top/node/1.html</a>
                </span>
            </div>
            <div class="reprint__notice">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-copyright">
                        版权声明:
                    </i>
                </span>
                <span class="reprint-info">
                    本博客所有文章除特别声明外，均采用
                    <a href="https://creativecommons.org/licenses/by/4.0/deed.zh" rel="external nofollow noreferrer" target="_blank">CC BY 4.0</a>
                    许可协议。转载请注明来源
                    <a href="/about" target="_blank">Aunean</a>
                    !
                </span>
            </div>
        
    </div>

    <script async defer>
      // Whenever the visitor copies something, show a toast whose action
      // button scrolls to the reprint statement.
      document.addEventListener("copy", function (e) {
        // The action element is a <button>, so it has to be closed with
        // </button>; a stray </a> leaves the button element unclosed.
        let toastHTML = '<span>复制成功，请遵循本文的转载规则</span><button class="btn-flat toast-action" onclick="navToReprintStatement()" style="font-size: smaller">查看</button>';
        M.toast({html: toastHTML})
      });

      // Animate the page scroll (800 ms) to 80px above the top of the
      // #reprint-statement element.
      function navToReprintStatement() {
        const scrollTarget = $("#reprint-statement").offset().top - 80;
        $("html, body").animate({scrollTop: scrollTarget}, 800);
      }
    </script>



            <div class="tag_share" style="display: block;">
                <div class="post-meta__tag-list" style="display: inline-block;">
                    
                        <div class="article-tag">
                            
                                <a href="/tags/python%E7%88%AC%E8%99%AB/">
                                    <span class="chip bg-color">python爬虫</span>
                                </a>
                            
                        </div>
                    
                </div>
                <div class="post_share" style="zoom: 80%; width: fit-content; display: inline-block; float: right; margin: -0.15rem 0;">
                    <link rel="stylesheet" type="text/css" href="/libs/share/css/share.min.css">
<div id="article-share">

    
    <div class="social-share" data-sites="twitter,facebook,google,qq,qzone,wechat,weibo,douban,linkedin" data-wechat-qrcode-helper="<p>微信扫一扫即可分享！</p>"></div>
    <script src="/libs/share/js/social-share.min.js"></script>
    

    

</div>

                </div>
            </div>
            
                <style>
    /* Reward widget container: vertical spacing and centered content. */
    #reward {
        margin: 40px 0;
        text-align: center;
    }

    /* Text of the trigger link that opens the reward modal. */
    #reward .reward-link {
        font-size: 1.4rem;
        line-height: 38px;
    }

    /* Stronger drop shadow while the floating button is hovered. */
    #reward .btn-floating:hover {
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.2), 0 5px 15px rgba(0, 0, 0, 0.2);
    }

    /* Fixed size of the reward modal. */
    #rewardModal {
        width: 320px;
        height: 350px;
    }

    #rewardModal .reward-title {
        margin: 15px auto;
        padding-bottom: 5px;
    }

    #rewardModal .modal-content {
        padding: 10px;
    }

    /* Close icon pinned to the modal's top-right corner. */
    #rewardModal .close {
        position: absolute;
        right: 15px;
        top: 15px;
        color: rgba(0, 0, 0, 0.5);
        font-size: 1.3rem;
        line-height: 20px;
        cursor: pointer;
    }

    /* On hover the close icon changes color and grows by 30%; the prefixed
       declarations repeat the same transform for older browser engines. */
    #rewardModal .close:hover {
        color: #ef5350;
        transform: scale(1.3);
        -moz-transform:scale(1.3);
        -webkit-transform:scale(1.3);
        -o-transform:scale(1.3);
    }

    /* Tab area: same 210px width as the QR-code images below. */
    #rewardModal .reward-tabs {
        margin: 0 auto;
        width: 210px;
    }

    .reward-tabs .tabs {
        height: 38px;
        margin: 10px auto;
        padding-left: 0;
    }

    /* Remove list indentation inside the reward content. */
    .reward-content ul {
        padding-left: 0 !important;
    }

    .reward-tabs .tabs .tab {
        height: 38px;
        line-height: 38px;
    }

    /* Inactive tabs: white text on grey, unchanged on hover. */
    .reward-tabs .tab a {
        color: #fff;
        background-color: #ccc;
    }

    .reward-tabs .tab a:hover {
        background-color: #ccc;
        color: #fff;
    }

    /* Active tab colors: one background for the WeChat tab, another for the
       Alipay tab. */
    .reward-tabs .wechat-tab .active {
        color: #fff !important;
        background-color: #22AB38 !important;
    }

    .reward-tabs .alipay-tab .active {
        color: #fff !important;
        background-color: #019FE8 !important;
    }

    /* Square QR-code image. */
    .reward-tabs .reward-img {
        width: 210px;
        height: 210px;
    }
</style>

<div id="reward">
    <a href="#rewardModal" class="reward-link modal-trigger btn-floating btn-medium waves-effect waves-light red">赏</a>

    <!-- Modal Structure -->
    <div id="rewardModal" class="modal">
        <div class="modal-content">
            <a class="close modal-close"><i class="fas fa-times"></i></a>
            <h4 class="reward-title">你的赏识是我前进的动力</h4>
            <div class="reward-content">
                <div class="reward-tabs">
                    <ul class="tabs row">
                        <li class="tab col s6 alipay-tab waves-effect waves-light"><a href="#alipay">支付宝</a></li>
                        <li class="tab col s6 wechat-tab waves-effect waves-light"><a href="#wechat">微 信</a></li>
                    </ul>
                    <div id="alipay">
                        <img src="/medias/reward/alipay.jpg" class="reward-img" alt="支付宝打赏二维码">
                    </div>
                    <div id="wechat">
                        <img src="/medias/reward/wechat.png" class="reward-img" alt="微信打赏二维码">
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<script>
    // Once the DOM is ready, run the .tabs() jQuery plugin on every .tabs
    // element (presumably the Materialize tabs component — confirm).
    $(function initTabs() {
        const $tabLists = $('.tabs');
        $tabLists.tabs();
    });
</script>

            
        </div>
    </div>

    
        <link rel="stylesheet" href="/libs/gitalk/gitalk.css">
<link rel="stylesheet" href="/css/my-gitalk.css">

<div class="card gitalk-card" data-aos="fade-up">
    <div class="comment_headling" style="font-size: 20px; font-weight: 700; position: relative; padding-left: 20px; top: 15px; padding-bottom: 5px;">
        <i class="fas fa-comments fa-fw" aria-hidden="true"></i>
        <span>评论</span>
    </div>
    <div id="gitalk-container" class="card-content"></div>
</div>

<script src="/libs/gitalk/gitalk.min.js"></script>
<script>
    // Create the Gitalk comment widget for this page and render it into
    // #gitalk-container.
    let gitalk = new Gitalk({
        clientID: '43918ae8f8557340e530',
        // NOTE(review): this secret is part of the served page source, so every
        // visitor can read it — confirm that exposure is acceptable for this
        // OAuth app, or rotate/proxy it.
        clientSecret: '016469b0d80284ccd95b27ea68909008a765d99e',
        // presumably the GitHub repository / account the comments are stored
        // under — verify against the Gitalk configuration docs
        repo: 'comment_by_blog',
        owner: 'Aunean-ls',
        admin: "Aunean-ls",
        // presumably the per-page identifier of the comment thread — TODO confirm
        id: '2021-04-11T12-01-00',
        distractionFreeMode: false  // Facebook-like distraction free mode
    });

    gitalk.render('gitalk-container');
</script>

    

    

    

    

    

    

    

    

<article id="prenext-posts" class="prev-next articles">
    <div class="row article-row">
        
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge left-badge text-color">
                <i class="fas fa-chevron-left"></i>&nbsp;上一篇</div>
            <div class="card">
                <a href="/node/5.html">
                    <div class="card-image">
                        
                        
                        <img src="https://cdn.jsdelivr.net/gh/Aunean-ls/pic/img/u022.webp" class="responsive-img" alt="kafka快速入门">
                        
                        <span class="card-title">kafka快速入门</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            本文记录kafka学习的一些笔记
                        
                    </div>
                    <div class="publish-info">
                        <span class="publish-date">
                            <i class="far fa-clock fa-fw icon-date"></i>2021-04-19
                        </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-bookmark fa-fw icon-category"></i>
                            
                            <a href="/categories/%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0/" class="post-category">
                                    学习笔记
                                </a>
                            
                            
                        </span>
                    </div>
                </div>
                
                <div class="card-action article-tags">
                    
                    <a href="/tags/kafka/">
                        <span class="chip bg-color">kafka</span>
                    </a>
                    
                </div>
                
            </div>
        </div>
        
        
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge right-badge text-color">
                本篇&nbsp;<i class="far fa-dot-circle"></i>
            </div>
            <div class="card">
                <a href="/node/1.html">
                    <div class="card-image">
                        
                        
                        <img src="https://cdn.jsdelivr.net/gh/Aunean-ls/pic/img/u001.webp" class="responsive-img" alt="数据采集笔记">
                        
                        <span class="card-title">数据采集笔记</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            本文记录python爬虫学习时的一些笔记
                        
                    </div>
                    <div class="publish-info">
                            <span class="publish-date">
                                <i class="far fa-clock fa-fw icon-date"></i>2021-04-11
                            </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-bookmark fa-fw icon-category"></i>
                            
                            <a href="/categories/%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0/" class="post-category">
                                    学习笔记
                                </a>
                            
                            
                        </span>
                    </div>
                </div>

                
                <div class="card-action article-tags">
                    
                    <a href="/tags/python%E7%88%AC%E8%99%AB/">
                        <span class="chip bg-color">python爬虫</span>
                    </a>
                    
                </div>
                
            </div>
        </div>
        
    </div>
</article>

</div>



<!-- 代码块功能依赖 -->
<script type="text/javascript" src="/libs/codeBlock/codeBlockFuction.js"></script>

<!-- 代码语言 -->

<script type="text/javascript" src="/libs/codeBlock/codeLang.js"></script>


<!-- 代码块复制 -->

<script type="text/javascript" src="/libs/codeBlock/codeCopy.js"></script>


<!-- 代码块收缩 -->

<script type="text/javascript" src="/libs/codeBlock/codeShrink.js"></script>


    </div>
    <div id="toc-aside" class="expanded col l3 hide-on-med-and-down">
        <div class="toc-widget card" style="background-color: white;">
            <div class="toc-title"><i class="far fa-list-alt"></i>&nbsp;&nbsp;目录</div>
            <div id="toc-content"></div>
        </div>
    </div>
</div>

<!-- TOC 悬浮按钮. -->

<div id="floating-toc-btn" class="hide-on-med-and-down">
    <a class="btn-floating btn-large bg-color">
        <i class="fas fa-list-ul"></i>
    </a>
</div>


<script src="/libs/tocbot/tocbot.min.js"></script>
<script>
    // Table-of-contents setup: builds the TOC with tocbot, rewrites the
    // anchors to numeric ids, pins the widget while scrolling and wires the
    // floating button that expands/collapses the TOC column.
    $(function () {
        tocbot.init({
            tocSelector: '#toc-content',
            contentSelector: '#articleContent',
            headingsOffset: -($(window).height() * 0.4 - 45),
            collapseDepth: Number('0'),
            headingSelector: 'h2, h3, h4, h5, h6'
        });

        // Replace title-derived anchors with sequential "toc-heading-N" ids
        // (the original note says this is done to support Chinese titles).
        const tocHeading = 'toc-heading-';

        // N-th TOC link -> "#toc-heading-N".
        $('#toc-content a').each(function (index, link) {
            $(link).attr('href', '#' + tocHeading + (index + 1));
        });

        // N-th heading that is a direct child of the article -> id "toc-heading-N".
        $('#articleContent').children('h2, h3, h4, h5, h6').each(function (index, heading) {
            $(heading).attr('id', tocHeading + (index + 1));
        });

        // Pin the TOC widget once the page is scrolled past the threshold.
        const tocHeight = parseInt($(window).height() * 0.4 - 64);
        const $tocWidget = $('.toc-widget');
        $(window).on('scroll', function () {
            const scrolled = $(window).scrollTop();
            $tocWidget.toggleClass('toc-fixed', scrolled > tocHeight);
        });

        // Copy the width of #srcId onto #targetId, plus a width-dependent
        // correction; does nothing when the source element is absent.
        const fixPostCardWidth = function (srcId, targetId) {
            const $src = $('#' + srcId);
            if ($src.length === 0) {
                return;
            }

            const baseWidth = $src.width();
            let extra;
            if (baseWidth >= 450) {
                extra = 21;
            } else if (baseWidth >= 350) {
                extra = 18;
            } else if (baseWidth >= 300) {
                extra = 16;
            } else {
                extra = 14;
            }
            $('#' + targetId).width(baseWidth + extra);
        };

        // Floating button: toggle the TOC column and give the main column
        // its 'l9' class only while the TOC is shown.
        const expandedClass = 'expanded';
        const $tocAside = $('#toc-aside');
        const $mainContent = $('#main-content');
        $('#floating-toc-btn .btn-floating').on('click', function () {
            const showToc = !$tocAside.hasClass(expandedClass);
            if (showToc) {
                $tocAside.addClass(expandedClass).show();
            } else {
                $tocAside.removeClass(expandedClass).hide();
            }
            $mainContent.toggleClass('l9', showToc);
            fixPostCardWidth('artDetail', 'prenext-posts');
        });

    });
</script>

    

</main>




    <footer class="page-footer bg-color">
    
        <link rel="stylesheet" href="/libs/aplayer/APlayer.min.css">
<style>
    /* Lyric lines of the player: hidden via display: none; the typography
       below only matters if that declaration is removed. */
    .aplayer .aplayer-lrc p {
        
        display: none;
        
        font-size: 12px;
        font-weight: 700;
        line-height: 16px !important;
    }

    /* Currently playing lyric line: hidden as well. */
    .aplayer .aplayer-lrc p.aplayer-lrc-current {
        
        display: none;
        
        font-size: 15px;
        color: #42b983;
    }

    
    /* Fixed + narrow player: shift the body 66px to the left ... */
    .aplayer.aplayer-fixed.aplayer-narrow .aplayer-body {
        left: -66px !important;
    }

    /* ... and move it back to left: 0 while hovered. */
    .aplayer.aplayer-fixed.aplayer-narrow .aplayer-body:hover {
        left: 0px !important;
    }
    
</style>
<div class="">
    
    <div class="row">
        <meting-js class="col l8 offset-l2 m10 offset-m1 s12"
                   server="tencent"
                   type="playlist"
                   id="4628814494"
                   fixed='true'
                   autoplay='false'
                   theme='#42b983'
                   loop='all'
                   order='random'
                   preload='auto'
                   volume='0.7'
                   list-folded='true'
        >
        </meting-js>
    </div>
</div>

<script src="/libs/aplayer/APlayer.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/meting@2/dist/Meting.min.js"></script>

    
    <div class="container row center-align" style="margin-bottom: 15px !important;">
        <div class="col s12 m8 l8 copy-right">
            Copyright&nbsp;&copy;
            
                <!-- Exactly one #year element: ids must be unique, and siteTime() rewrites this span via getElementById. -->
                <span id="year">2021</span>
            <a href="/about" target="_blank">Aunean</a>
            |&nbsp;Powered by&nbsp;<a href="https://hexo.io/" target="_blank">Hexo</a>
            |&nbsp;Theme&nbsp;<a href="https://github.com/blinkfox/hexo-theme-matery" target="_blank">Matery</a>
            <br>
            
            &nbsp;<i class="fas fa-chart-area"></i>&nbsp;站点总字数:&nbsp;<span
                class="white-color">64.8k</span>&nbsp;字
            
            
            
            
            
            
            <span id="busuanzi_container_site_pv">
                |&nbsp;<i class="far fa-eye"></i>&nbsp;总访问量:&nbsp;<span id="busuanzi_value_site_pv"
                    class="white-color"></span>&nbsp;次
            </span>
            
            
            <span id="busuanzi_container_site_uv">
                |&nbsp;<i class="fas fa-users"></i>&nbsp;总访问人数:&nbsp;<span id="busuanzi_value_site_uv"
                    class="white-color"></span>&nbsp;人
            </span>
            
            <br>
            
            <span id="sitetime">载入运行时间...</span>
            <script>
                /**
                 * Refresh the footer: writes the elapsed running time since the
                 * configured start date into #sitetime and the copyright year
                 * (or "start - current" year range) into #year.
                 *
                 * The elapsed time is split into years of exactly 365 days
                 * (leap days are not accounted for), days, hours, minutes and
                 * seconds.
                 */
                function siteTime() {
                    var seconds = 1000;
                    var minutes = seconds * 60;
                    var hours = minutes * 60;
                    var days = hours * 24;
                    var years = days * 365;
                    var today = new Date();
                    // Site start date; the month is written 1-based (4 = April).
                    var startYear = "2021";
                    var startMonth = "4";
                    var startDate = "11";
                    var startHour = "0";
                    var startMinute = "0";
                    var startSecond = "0";
                    var todayYear = today.getFullYear();
                    // getMonth() is already zero-based, which is what Date.UTC expects.
                    var todayMonth = today.getMonth();
                    var todayDate = today.getDate();
                    var todayHour = today.getHours();
                    var todayMinute = today.getMinutes();
                    var todaySecond = today.getSeconds();
                    // Date.UTC takes a zero-based month index, so the 1-based start
                    // month is shifted down by one. Passing 1-based months would move
                    // both dates into the following month and skew the day count
                    // whenever the month lengths differ.
                    var t1 = Date.UTC(startYear, startMonth - 1, startDate, startHour, startMinute, startSecond);
                    var t2 = Date.UTC(todayYear, todayMonth, todayDate, todayHour, todayMinute, todaySecond);
                    var diff = t2 - t1;
                    var diffYears = Math.floor(diff / years);
                    var diffDays = Math.floor((diff / days) - diffYears * 365);
                    var diffHours = Math.floor((diff - (diffYears * 365 + diffDays) * days) / hours);
                    var diffMinutes = Math.floor((diff - (diffYears * 365 + diffDays) * days - diffHours * hours) /
                        minutes);
                    var diffSeconds = Math.floor((diff - (diffYears * 365 + diffDays) * days - diffHours * hours -
                        diffMinutes * minutes) / seconds);
                    if (startYear == todayYear) {
                        // Still in the start year: single year, no "years" component.
                        document.getElementById("year").innerHTML = todayYear;
                        document.getElementById("sitetime").innerHTML = "本站已安全运行 " + diffDays + " 天 " + diffHours +
                            " 小时 " + diffMinutes + " 分钟 " + diffSeconds + " 秒";
                    } else {
                        document.getElementById("year").innerHTML = startYear + " - " + todayYear;
                        document.getElementById("sitetime").innerHTML = "本站已安全运行 " + diffYears + " 年 " + diffDays +
                            " 天 " + diffHours + " 小时 " + diffMinutes + " 分钟 " + diffSeconds + " 秒";
                    }
                }
                // Render once right away so the placeholder text does not linger
                // until the first interval tick, then refresh every second.
                siteTime();
                setInterval(siteTime, 1000);
            </script>
            
            <br>
            
        </div>
        <div class="col s12 m4 l4 social-link social-statis">
    <a href="https://github.com/Aunean-ls" class="tooltipped" target="_blank" data-tooltip="访问我的GitHub" data-position="top" data-delay="50">
        <i class="fab fa-github"></i>
    </a>



    <a href="mailto:1453357375@qq.com" class="tooltipped" target="_blank" data-tooltip="邮件联系我" data-position="top" data-delay="50">
        <i class="fas fa-envelope-open"></i>
    </a>







    <a href="tencent://AddContact/?fromId=50&fromSubId=1&subcmd=all&uin=1453357375" class="tooltipped" target="_blank" data-tooltip="QQ联系我: 1453357375" data-position="top" data-delay="50">
        <i class="fab fa-qq"></i>
    </a>







    <a href="/atom.xml" class="tooltipped" target="_blank" data-tooltip="RSS 订阅" data-position="top" data-delay="50">
        <i class="fas fa-rss"></i>
    </a>

</div>
    </div>
</footer>

<div class="progress-bar"></div>


    <!-- 搜索遮罩框 -->
<div id="searchModal" class="modal">
    <div class="modal-content">
        <div class="search-header">
            <span class="title"><i class="fas fa-search"></i>&nbsp;&nbsp;搜索</span>
            <input type="search" id="searchInput" name="s" placeholder="请输入搜索的关键字"
                   class="search-input">
        </div>
        <div id="searchResult"></div>
    </div>
</div>

<script type="text/javascript">
$(function () {
    // Escape regular-expression metacharacters so that user input is matched
    // literally when it is compiled into the highlight pattern.
    var escapeRegExp = function (text) {
        return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    };

    /**
     * Load the XML search index and attach live, client-side searching to an
     * input element.
     *
     * @param {string} path       URL of the XML index; each <entry> is read for
     *                            its <title>, <content> and <url>.
     * @param {string} search_id  id of the input whose 'input' events trigger a search.
     * @param {string} content_id id of the element that receives the result list.
     */
    var searchFunc = function (path, search_id, content_id) {
        'use strict';
        $.ajax({
            url: path,
            dataType: "xml",
            success: function (xmlResponse) {
                // get the contents from search data
                var datas = $("entry", xmlResponse).map(function () {
                    return {
                        title: $("title", this).text(),
                        content: $("content", this).text(),
                        url: $("url", this).text()
                    };
                }).get();
                var $input = document.getElementById(search_id);
                var $resultContent = document.getElementById(content_id);
                $input.addEventListener('input', function () {
                    var query = this.value.trim();
                    $resultContent.innerHTML = "";
                    if (query.length <= 0) {
                        return;
                    }
                    // Splitting on whitespace/hyphens can yield empty fragments
                    // (e.g. "foo -"); an empty keyword would match every entry.
                    var keywords = query.toLowerCase().split(/[\s\-]+/).filter(function (keyword) {
                        return keyword.length > 0;
                    });
                    if (keywords.length === 0) {
                        return;
                    }
                    // One pattern for all keywords: input is escaped so characters
                    // such as "(" cannot throw, and a single pass keeps a later
                    // keyword from matching inside already inserted <em> markup.
                    var highlightPattern = new RegExp(keywords.map(escapeRegExp).join('|'), "gi");

                    var str = '<ul class=\"search-result-list\">';
                    // perform local searching
                    datas.forEach(function (data) {
                        var title = data.title.trim();
                        var content = data.content.trim().replace(/<[^>]+>/g, "");
                        var data_title = title.toLowerCase();
                        var data_content = content.toLowerCase();
                        var data_url = data.url.indexOf('/') === 0 ? data.url : '/' + data.url;

                        // only match articles with non-empty titles and contents
                        if (data_title === '' || data_content === '') {
                            return;
                        }

                        // Every keyword has to occur in the title or the content.
                        // The snippet is anchored at the first keyword's position
                        // in the content (0 when it only occurs in the title).
                        var first_occur = -1;
                        var isMatch = keywords.every(function (keyword, i) {
                            var index_title = data_title.indexOf(keyword);
                            var index_content = data_content.indexOf(keyword);
                            if (index_title < 0 && index_content < 0) {
                                return false;
                            }
                            if (i === 0) {
                                first_occur = index_content < 0 ? 0 : index_content;
                            }
                            return true;
                        });
                        if (!isMatch) {
                            return;
                        }

                        // show search results (title keeps its original casing)
                        str += "<li><a href='" + data_url + "' class='search-result-title'>" + title + "</a>";
                        if (first_occur >= 0) {
                            // cut out about 100 characters around the first match
                            var start = Math.max(first_occur - 20, 0);
                            var end = start === 0 ? 100 : first_occur + 80;
                            if (end > content.length) {
                                end = content.length;
                            }
                            // substring(start, end) takes an end index; substr()
                            // would interpret the second argument as a length.
                            var match_content = content.substring(start, end);
                            // highlight all keywords, preserving the matched text's casing
                            match_content = match_content.replace(highlightPattern, function (matched) {
                                return "<em class=\"search-keyword\">" + matched + "</em>";
                            });

                            str += "<p class=\"search-result\">" + match_content + "...</p>"
                        }
                        str += "</li>";
                    });
                    str += "</ul>";
                    $resultContent.innerHTML = str;
                });
            }
        });
    };

    searchFunc('/search.xml', 'searchInput', 'searchResult');
});
</script>

    <!-- 回到顶部按钮 -->
<div id="backTop" class="top-scroll">
    <a class="btn-floating btn-large waves-effect waves-light" href="#!">
        <i class="fas fa-arrow-up"></i>
    </a>
</div>


    <script src="/libs/materialize/materialize.min.js"></script>
    <script src="/libs/masonry/masonry.pkgd.min.js"></script>
    <script src="/libs/aos/aos.js"></script>
    <script src="/libs/scrollprogress/scrollProgress.min.js"></script>
    <script src="/libs/lightGallery/js/lightgallery-all.min.js"></script>
    <script src="/js/matery.js"></script>

    <script type="text/javascript">
        // Swap the tab title while the page is hidden; when it becomes visible
        // again show a greeting and restore the original title after 3 seconds.
        var OriginTitile = document.title, st;
        document.addEventListener("visibilitychange", function () {
            if (document.hidden) {
                document.title = "ヽ(●-`Д´-)ノ页面丢失了";
                // Cancel a pending restore so it cannot overwrite the hidden title.
                clearTimeout(st);
            } else {
                document.title = "(Ő∀Ő3)ノ又好了哦！";
                st = setTimeout(function () {
                    document.title = OriginTitile;
                }, 3000);
            }
        })
    </script>


    <script type="text/javascript">
    // Enable the sakura.js effect only on desktop-width pages (wider than 768px).
        var windowWidth = $(window).width();
        if (768 < windowWidth) {
            // Written during parsing; "<\/script>" keeps the literal from
            // terminating the enclosing inline script element.
            document.write('<script type="text/javascript" src="/js/sakura.js"><\/script>');
        }
    </script>


    <!-- Baidu Analytics -->

    <!-- Baidu Push -->

<script>
    // Inject the Baidu push script that matches the page's protocol, placing
    // it in front of the first <script> element of the document.
    (function () {
        var pushScript = document.createElement('script');
        var isHttps = window.location.protocol.split(':')[0] === 'https';
        pushScript.src = isHttps
            ? 'https://zz.bdstatic.com/linksubmit/push.js'
            : 'http://push.zhanzhang.baidu.com/push.js';
        var firstScript = document.getElementsByTagName("script")[0];
        firstScript.parentNode.insertBefore(pushScript, firstScript);
    })();
</script>

    
    <script src="/libs/others/clicklove.js" async="async"></script>
    
    
    <script async src="/libs/others/busuanzi.pure.mini.js"></script>
    

    

    

	
    
    <script type="text/javascript" color="0,0,255"
        pointColor="0,0,255" opacity='0.7'
        zIndex="-1" count="99"
        src="/libs/background/canvas-nest.js"></script>
    

    
    
    <script type="text/javascript" size="150" alpha='0.6'
        zIndex="-1" src="/libs/background/ribbon-refresh.min.js" async="async"></script>
    

    
    <script type="text/javascript" src="/libs/background/ribbon-dynamic.js" async="async"></script>
    

    
    <script src="/libs/instantpage/instantpage.js" type="module"></script>
    


</body>

</html>
